info_healthy <- read_xlsx ("data/raw/final_health_statistic.xlsx") #metadata of healthy subjects
info_ibs <- read_xlsx ("data/raw/final_ibs_141_statistic.xlsx") ##metadata of IBS subjects
#Combining `info_healthy` and `info_ibs` into a single dataset `combined_info` with subsequent content editing
combined_info <- info_healthy %>%
bind_rows(info_ibs) %>%
mutate (
BMI_min = ifelse (is.na (BMI_min), round (Weight_min /(Height_max/100 * Height_max/100), 2), BMI_min),
BMI_max = ifelse (is.na (BMI_max), round (Weight_max /(Height_min/100 * Height_min/100), 2), BMI_max)
) %>%
unite("BMI_range", BMI_min, BMI_max, sep = "-", na.rm = TRUE) %>%
unite ("Age_range", Age_min, Age_max, sep = "-", na.rm = TRUE) %>%
mutate(
Age_range = case_when(
Age_range == "18-40" | Age_range == "23-28" | Age_range == "16-42" | Age_range == "21-43" ~ "16-43",
Age <= 43 ~ "16-43",
Age > 43 ~ "> 43",
TRUE ~ NA_character_), #group 28-54 has been deleted
research_ID = sub ("research_", "", research_ID),
research_ID = case_when(
research_ID == 0 ~ 1,
research_ID == 1 ~ 2,
research_ID == 2 ~ 3,
research_ID == 3 ~ 4,
research_ID == 4 ~ 5,
research_ID == 6 ~ 6,
research_ID == 7 ~ 7),
patient_ID = row_number(),
Sex = ifelse (Sex == "mixed", NA, Sex),
Smoking = sub ("never", "Never", Smoking),
Smoking = case_when(
Smoking == "No" ~ "No",
Smoking == "Never" ~ "No",
Smoking == "Rarely (a few times/month)" ~ "Yes", #5 people
Smoking == "Occasionally (1-2 times/week)" ~ "Yes", #3 people
Smoking == "Regularly (3-5 times/week)" ~ "Yes", #1 people
Smoking == "Daily" ~ "Yes"), #7 чел.
Alcohol = sub ("rarely", "Rarely", Alcohol),
Alcohol = ifelse(Alcohol == "Regularly (3-5 times/week)"|
Alcohol == "Daily", #3 чел.
"Regularly (3-7 times/week)", Alcohol),
Antibiotics_usage = case_when(
Antibiotics_usage == "Month" | Antibiotics_usage == ~ "3 months" |
Antibiotics_usage == "6 months" ~ "1-6 months",
#2 IBS people - Month, 0 healthy people - 3 months, 0 healthy people - 6 months
Antibiotics_usage == "Year" | Antibiotics_usage == "Not use" ~
"12 months/Not use"), # 0 healthy people - 6-12 months, zero IBS people - Not use
Hygiene = case_when(
Hygiene == "Occasionally (1-2 times/week) cosmetics" ~ "Occasionally cosmetics (1-2 times/week)",
Hygiene == "Rarely (a few times/month) cosmetics" ~ "Rarely cosmetics (a few times/month)",
TRUE ~ Hygiene),
Hygiene = case_when(
Hygiene == "Daily cosmetics"|Hygiene == "Regularly cosmetics (3-5 times/week)" ~ "Regularly (3-7 times/week)", #0 чел. больных для 3-5 times/week
Hygiene == "Occasionally cosmetics (1-2 times/week)" | Hygiene == "Rarely cosmetics (a few times/month)" ~ "Occasionally (a few-8 times/month)",
#только 2 чел. больных для 1-2 times/week
Hygiene == "Never cosmetics" ~ "Never"),
Physical_activity = sub ("regularly", "Regularly", Smoking),
BMI = ifelse (is.na(Weight_kg), BMI, Weight_kg/ (Height_cm/100 * Height_cm/100)),
BMI_range = ifelse(BMI_range == "", NA, BMI_range),
BMI_category = case_when(
BMI_range == "18-25" ~ "normal/overweight",
BMI_range == "19.21-29.29" ~ "normal/overweight",
BMI_range == "20.6-29.6" ~ "normal/overweight",
BMI_range == "21.74-28.38" ~ "normal/overweight",
BMI_range == "18.5-30.8" ~ "normal/overweight", #a little over 30
BMI < 18.5 ~ "underweight",
BMI >= 18.5 & BMI < 30 ~ "normal/overweight",
BMI >= 30 ~ "obese")
) %>%
rename ("Cosmetics" = Hygiene ) %>%
mutate_if (is.character, as.factor) %>%
select(-c(
Instrument, # unique (combined_info$Instrument) = "Illumina MiSeq"
Isolation_source, # unique (combined_info$Isolation_source) = "faeces"
Assay_type, # unique (combined_info$Assay_type) = "AMPLICON"
Target_gene, # unique (combined_info$Target_gene) = "16S"
Main_Disease, # unique (combined_info$Main_Disease) = NA (for healthy), 141 (for IBS)
Drugs, # unique (combined_info$Drugs) = NA
Social_status, # unique (combined_info$Social_status) = NA, urban
Weight_kg, Height_cm, Weight_min, Weight_max, Height_min, Height_max, BMI_range, #have been used to create the variable "BMI_category" and to reduce the amount of NA in "BMI"
BMI, # NA for all healthy people
Birth_Year, # no additional information
Pets_type # unique (combined_info$Pets_type = cat, NA
))
rm (info_healthy, info_ibs)
#install.packages("summarytools")
library(summarytools)
saved_x11_option <- st_options("use.x11")
st_options(use.x11 = FALSE)
combined_info %>%
select(- patient_ID) %>%
dfSummary() %>%
print (method = "render")
| No | Variable | Stats / Values | Freqs (% of Valid) | Valid | Missing | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | research_ID [numeric] |
|
|
381 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 2 | Seq_region [factor] |
|
|
381 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 3 | Seq_date [numeric] |
|
|
381 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 4 | Health_state [factor] |
|
|
381 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 5 | Age [numeric] |
|
45 distinct values | 134 (35.2%) | 247 (64.8%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 6 | Age_range [factor] |
|
|
286 (75.1%) | 95 (24.9%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 7 | Sex [factor] |
|
|
191 (50.1%) | 190 (49.9%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 8 | Country [factor] |
|
|
381 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 9 | Race [factor] |
|
|
130 (34.1%) | 251 (65.9%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 10 | Smoking [factor] |
|
|
203 (53.3%) | 178 (46.7%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 11 | Alcohol [factor] |
|
|
87 (22.8%) | 294 (77.2%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 12 | Antibiotics_usage [factor] |
|
|
177 (46.5%) | 204 (53.5%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 13 | Physical_activity [factor] |
|
|
203 (53.3%) | 178 (46.7%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 14 | Travel_period [factor] |
|
|
86 (22.6%) | 295 (77.4%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 15 | Education_level [factor] |
|
|
84 (22.0%) | 297 (78.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 16 | Cosmetics [factor] |
|
|
86 (22.6%) | 295 (77.4%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 17 | Sleep_duration [factor] |
|
|
87 (22.8%) | 294 (77.2%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 18 | Diet_type [factor] |
|
|
18 (4.7%) | 363 (95.3%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 19 | Diet_duration [factor] |
|
|
13 (3.4%) | 368 (96.6%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 20 | Additive_usage [factor] |
|
|
11 (2.9%) | 370 (97.1%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 21 | BMI_category [factor] |
|
|
293 (76.9%) | 88 (23.1%) |
Generated by summarytools 1.0.1 (R version 4.3.2)
2024-02-11
combined_info %>%
select (research_ID, Country, Seq_date, Seq_region, Health_state) %>%
mutate(Country = if_else (research_ID == 4, "Australia, Austria, Germany, Greece, Hungary, Ireland, Israel, Italy, Norway, UK, USA", Country),
Seq_date = ifelse (research_ID == 4, "2018-2023", Seq_date),
Health_state = if_else (research_ID == 4, "Disease/Health", Health_state)) %>%
group_by(research_ID, Country, Seq_date, Seq_region, Health_state) %>%
summarise(n = n()) %>%
flextable() %>% theme_box() %>%
merge_v("Health_state") %>%
align(align = "center", part = "all") %>%
set_caption("General characteristics of the included studies")
research_ID | Country | Seq_date | Seq_region | Health_state | n |
|---|---|---|---|---|---|
1 | Belgium | 2018 | V5-V6 | Health | 46 |
2 | Italy | 2018 | V3-V4 | 36 | |
3 | Poland | 2023 | V3-V4 | 70 | |
4 | Australia, Austria, Germany, Greece, Hungary, Ireland, Israel, Italy, Norway, UK, USA | 2018-2023 | V4 | Disease/Health | 87 |
5 | Austria | 2017 | V4 | Disease | 25 |
6 | Israel | 2022 | V4 | 22 | |
7 | Spain | 2015 | V4 | 95 |
library(gtsummary)
combined_info %>%
select(! c(patient_ID,
Diet_duration, Additive_usage, Diet_type #for all healthy subjects = NA
)) %>%
tbl_summary(digits = list(all_continuous() ~ c(0, 0),
all_categorical() ~ c(0, 0)),
by = Health_state) %>%
add_p()
| Characteristic | Disease, N = 1701 | Health, N = 2111 | p-value2 |
|---|---|---|---|
| research_ID | <0.001 | ||
| 1 | 0 (0%) | 46 (22%) | |
| 2 | 0 (0%) | 36 (17%) | |
| 3 | 0 (0%) | 70 (33%) | |
| 4 | 28 (16%) | 59 (28%) | |
| 5 | 25 (15%) | 0 (0%) | |
| 6 | 22 (13%) | 0 (0%) | |
| 7 | 95 (56%) | 0 (0%) | |
| Seq_region | <0.001 | ||
| V3-V4 | 0 (0%) | 106 (50%) | |
| V4 | 170 (100%) | 59 (28%) | |
| V5-V6 | 0 (0%) | 46 (22%) | |
| Seq_date | <0.001 | ||
| 2015 | 95 (56%) | 0 (0%) | |
| 2017 | 25 (15%) | 0 (0%) | |
| 2018 | 14 (8%) | 82 (39%) | |
| 2020 | 3 (2%) | 36 (17%) | |
| 2021 | 9 (5%) | 23 (11%) | |
| 2022 | 23 (14%) | 0 (0%) | |
| 2023 | 1 (1%) | 70 (33%) | |
| Age | 42 (32, 50) | 28 (26, 37) | <0.001 |
| Unknown | 95 | 152 | |
| Age_range | <0.001 | ||
| > 43 | 34 (45%) | 10 (5%) | |
| 16-43 | 41 (55%) | 201 (95%) | |
| Unknown | 95 | 0 | |
| Sex | 0.15 | ||
| female | 35 (48%) | 44 (37%) | |
| male | 38 (52%) | 74 (63%) | |
| Unknown | 97 | 93 | |
| Country | |||
| Australia | 0 (0%) | 6 (3%) | |
| Austria | 25 (15%) | 3 (1%) | |
| Belgium | 0 (0%) | 46 (22%) | |
| Germany | 0 (0%) | 45 (21%) | |
| Greece | 0 (0%) | 1 (0%) | |
| Hungary | 0 (0%) | 2 (1%) | |
| Ireland | 1 (1%) | 0 (0%) | |
| Israel | 22 (13%) | 1 (0%) | |
| Italy | 0 (0%) | 37 (18%) | |
| Norway | 1 (1%) | 0 (0%) | |
| Poland | 0 (0%) | 70 (33%) | |
| Spain | 95 (56%) | 0 (0%) | |
| UK | 12 (7%) | 0 (0%) | |
| USA | 14 (8%) | 0 (0%) | |
| Race | 0.4 | ||
| African American | 0 (0%) | 1 (1%) | |
| Asian or Pacific Islander | 0 (0%) | 1 (1%) | |
| Caucasian | 27 (100%) | 89 (86%) | |
| Hispanic | 0 (0%) | 1 (1%) | |
| Other | 0 (0%) | 11 (11%) | |
| Unknown | 143 | 108 | |
| Smoking | 4 (14%) | 12 (7%) | 0.2 |
| Unknown | 142 | 36 | |
| Alcohol | 0.002 | ||
| Never | 4 (14%) | 8 (14%) | |
| Occasionally (1-2 times/week) | 4 (14%) | 20 (34%) | |
| Rarely (a few times/month) | 9 (32%) | 27 (46%) | |
| Regularly (3-7 times/week) | 11 (39%) | 4 (7%) | |
| Unknown | 142 | 152 | |
| Antibiotics_usage | 0.020 | ||
| 1-6 months | 2 (8%) | 46 (30%) | |
| 12 months/Not use | 23 (92%) | 106 (70%) | |
| Unknown | 145 | 59 | |
| Physical_activity | 4 (14%) | 12 (7%) | 0.2 |
| Unknown | 142 | 36 | |
| Travel_period | 0.066 | ||
| 3 months | 3 (11%) | 15 (26%) | |
| 6 months | 1 (4%) | 9 (16%) | |
| Month | 4 (14%) | 9 (16%) | |
| Year | 20 (71%) | 25 (43%) | |
| Unknown | 142 | 153 | |
| Education_level | <0.001 | ||
| Bachelor's level | 5 (18%) | 26 (46%) | |
| Master's level | 18 (64%) | 10 (18%) | |
| School Level | 5 (18%) | 20 (36%) | |
| Unknown | 142 | 155 | |
| Cosmetics | 0.6 | ||
| Never | 15 (56%) | 29 (49%) | |
| Occasionally (a few-8 times/month) | 6 (22%) | 11 (19%) | |
| Regularly (3-7 times/week) | 6 (22%) | 19 (32%) | |
| Unknown | 143 | 152 | |
| Sleep_duration | 0.2 | ||
| > 8 hours | 4 (14%) | 5 (8%) | |
| 4-6 hours | 4 (14%) | 5 (8%) | |
| 6-7 hours | 12 (43%) | 19 (32%) | |
| 7-8 hours | 8 (29%) | 30 (51%) | |
| Unknown | 142 | 152 | |
| BMI_category | 0.012 | ||
| normal/overweight | 135 (96%) | 152 (100%) | |
| obese | 5 (4%) | 0 (0%) | |
| underweight | 1 (1%) | 0 (0%) | |
| Unknown | 29 | 59 | |
| 1 n (%); Median (IQR) | |||
| 2 Pearson’s Chi-squared test; Wilcoxon rank sum test; Fisher’s exact test | |||
###combined_bacteria and data_wide (not batched) datasets
bacteria_healthy <- read_csv("data/raw/final_bacteria_health.csv") #abundance table for healthy
bacteria_ibs <- read_csv("data/raw/final_bacteria_ibs_141.csv") #abundance table for healthy
combined_bacteria <- bacteria_healthy %>%
bind_rows (bacteria_ibs) %>%
mutate(patient_ID = row_number())
rm (bacteria_healthy, bacteria_ibs)
# Combining `combined_info` and `combined_bacteria` into `data_wide`
data_wide <- combined_info %>% left_join (combined_bacteria)
data_wide %>%
select (where (function(x) sum (is.na(x))/ nrow(data_wide) * 100 > 0)) %>%
sapply (function(x) sum (is.na(x))/ nrow(data_wide) * 100) %>% round(1) %>%
as.data.frame() %>%
rename(NA_percentage = ".") %>%
mutate (
"Number of people with known data" = round (nrow(data_wide) - NA_percentage/100 * nrow(data_wide)),
NA_percentage = paste (NA_percentage, "%", sep = " ")
) %>%
arrange(desc (NA_percentage)) %>%
rownames_to_column() %>%
as_tibble() %>% flextable()
rowname | NA_percentage | Number of people with known data |
|---|---|---|
Additive_usage | 97.1 % | 11 |
Diet_duration | 96.6 % | 13 |
Diet_type | 95.3 % | 18 |
Education_level | 78 % | 84 |
Travel_period | 77.4 % | 86 |
Cosmetics | 77.4 % | 86 |
Alcohol | 77.2 % | 87 |
Sleep_duration | 77.2 % | 87 |
Race | 65.9 % | 130 |
Age | 64.8 % | 134 |
Antibiotics_usage | 53.5 % | 177 |
Sex | 49.9 % | 191 |
Smoking | 46.7 % | 203 |
Physical_activity | 46.7 % | 203 |
Age_range | 24.9 % | 286 |
BMI_category | 23.1 % | 293 |
data_wide# Устанавливаем порог процента
threshold_percent <- 95
# Функция для вычисления процента записей, не равных NA и не равных 0, для каждой колонки
calculate_percentage <- function(col) {
sum(!is.na(col) & col != 0) / length(col) * 100
}
# Применяем функцию к каждой колонке в датасете
percentage_non_zero_non_na <- sapply(data_wide[, -1], calculate_percentage)
# Создаем датафрейм с результатами
result_df_sort <- data.frame(
column = names(percentage_non_zero_non_na),
percentage = round(100 - percentage_non_zero_non_na, 2) # percentage означает пропущенные или 0 значения
) %>%
arrange(desc(percentage))
# Отфильтровываем колонки, у которых процент записей менее threshold_percent%
filtered_columns <- result_df_sort[result_df_sort$percentage < threshold_percent, ]
# Сохраним датасет в excel для дальнейшего анализа
write.xlsx(filtered_columns,
file = "data/originals/percentage_by_vars.xlsx")
# Перезапись data_wide с выбором колонок с процентом NA/0 менее threshold_percent
data_wide <- data_wide %>%
select (row.names(filtered_columns), research_ID)
rm (calculate_percentage, result_df_sort)
data_wide# Устанавливаем порог процента
threshold_percent <- 95
# Рассчитываем процент значений, не являющихся NA и не равных 0, для каждого пациента
percentage_non_zero_non_na <- rowMeans(!is.na(data_wide) & data_wide != 0, na.rm = TRUE) * 100
# Создаем датафрейм с результатами
result_df_sort <- data.frame(
patient_id = data_wide$patient_ID,
percentage = round(100 - percentage_non_zero_non_na, 2) # percentage означает пропущенные или 0 значения
) %>% arrange(desc(percentage))
# Отфильтровываем пациентов, у которых процент значений менее threshold_percent%
filtered_patients <- result_df_sort[result_df_sort$percentage < threshold_percent, ]
# Сохраним датасет в excel для дальнейшего анализа
write.xlsx(filtered_patients,
file = "data/originals/percentage_by_patient.xlsx")
# Перезапись data_wide с удалением строк с процентом NA/0 более threshold_percent
data_wide <- data_wide %>%
slice (filtered_patients$patient_id) #при threshold_percent = 95%, изменения data_wide не происходит, так как нет пациентов с процентом NA/0 более 95%
rm (percentage_non_zero_non_na, result_df_sort)
# Deleting columns and rows with a NA/0-percentage more than threshold_percent from `data_wide`
data_wide <- data_wide %>%
select (patient_ID, any_of (colnames(combined_info)), everything()) %>%
arrange(patient_ID)
G_only_one <- data_wide %>%
select (research_ID, ends_with("_G")) %>%
add_column(n = 1) %>% #for further counting the subjects number in each study
group_by(research_ID) %>%
summarise_each (sum) %>%
select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>% #selecting "single study taxa"
mutate (across(-c(1:2), function(x) x/n)) %>%
mutate (across(-c(1:2), function(x) round (x,3)))
G_only_one %>% as_tibble() %>% flextable() %>% set_caption("Mean percentage of single study G-taxa")
research_ID | n | CM1G08_G | Cladosporium_G | Lentimonas_G | Micromonospora_G | Pseudosphingobacterium_G | Schizothrix LEGE 07164_G | Talaromyces_G | Rs-D38 termite group_G | Kabatiella_G | Anaerolineaceae UCG-001_G | Iamia_G | Thiohalocapsa_G | Ellin517_G |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 46 | 0.000 | 0.000 | 0.05 | 0.033 | 0.017 | 0.011 | 0.000 | 0.000 | 0.000 | 0.07 | 0.03 | 0.274 | 0.000 |
2 | 36 | 0.000 | 0.008 | 0.00 | 0.000 | 0.000 | 0.000 | 0.014 | 0.000 | 0.022 | 0.00 | 0.00 | 0.000 | 0.000 |
3 | 70 | 0.000 | 0.000 | 0.00 | 0.000 | 0.000 | 0.000 | 0.000 | 0.005 | 0.000 | 0.00 | 0.00 | 0.000 | 0.000 |
4 | 87 | 0.006 | 0.000 | 0.00 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.00 | 0.00 | 0.000 | 0.000 |
5 | 25 | 0.000 | 0.000 | 0.00 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.00 | 0.00 | 0.000 | 0.000 |
6 | 22 | 0.000 | 0.000 | 0.00 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.00 | 0.00 | 0.000 | 0.000 |
7 | 95 | 0.000 | 0.000 | 0.00 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.00 | 0.00 | 0.000 | 1.594 |
#Determining the percentage of people in whom these taxa have not been detected
G_taxon <- 'Ellin517_G'
res_ID <- 7
#data_wide %>%
#select (research_ID, G_taxon) %>%
#filter (research_ID == res_ID) %>%
#summarise(across (-1,
#function (x) sum (x==0)/nrow (.) * 100)) %>%
#rename ("Taxon_zero_percentage, %" = G_taxon)
# CM1G08_G - 73,6% нулей
# Cladosporium_G - 94,0% нулей
# Lentimonas_G - 94,0% нулей
# Micromonospora_G - 93,2% нулей
# Pseudosphingobacterium_G - 93,2% нулей
# Schizothrix LEGE 07164_G - 92,7% нулей
# Talaromyces_G - 92,7% нулей
# Rs-D38 termite group_G - 91,9% нулей
# Kabatiella_G - 90,6% нулей
# Anaerolineaceae UCG-001_G - 89,5% нулей
# Iamia_G - 88,2% нулей
# Thiohalocapsa_G - 87,9% нулей
# Ellin517_G - 75,1% нулей
data_wide %>%
select (research_ID, ends_with("_F")) %>%
add_column(n = 1) %>% # for further counting the subjects number in each study
group_by(research_ID) %>%
summarise_each (sum) %>%
select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>% #selecting "single study taxa"
mutate (across(-c(1:2), function(x) x/n)) %>%
mutate (across(-c(1:2), function(x) round (x,3))) %>%
as_tibble() %>% flextable() %>% set_caption("Mean percentage of single study F-taxa")
research_ID | n | Didymellaceae_F | Cladosporiaceae_F | Trichocomaceae_F | type III_F | 09D2Z48_F | Aureobasidiaceae_F | Iamiaceae_F |
|---|---|---|---|---|---|---|---|---|
1 | 46 | 0.000 | 0.000 | 0.000 | 0.011 | 0.012 | 0.000 | 0.03 |
2 | 36 | 0.021 | 0.008 | 0.014 | 0.000 | 0.000 | 0.022 | 0.00 |
3 | 70 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.00 |
4 | 87 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.00 |
5 | 25 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.00 |
6 | 22 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.00 |
7 | 95 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.00 |
#Determining the percentage of people in whom these taxa have not been detected
#F_taxon <- 'Iamiaceae_F'
#res_ID <- 1
#data_wide %>%
#select (research_ID, F_taxon) %>%
#filter (research_ID == res_ID) %>%
#summarise(across (-1,
#function (x) sum (x==0)/nrow (.) * 100)) %>%
#rename ("Taxon_zero_percentage, %" = F_taxon)
# Didymellaceae_F - 44,4% нулей
# Cladosporiaceae_F - 36,1% нулей
# Trichocomaceae_F - 22,2% нулей
# type III_F - 30,4% нулей
# 09D2Z48_F - 23,9% нулей
# Aureobasidiaceae_F - 0% нулей
# Iamiaceae_F - 2,2% нулей
data_wide %>%
select (research_ID, ends_with("_O")) %>%
add_column(n = 1) %>% #for further counting the subjects number in each study
group_by(research_ID) %>%
summarise_each (sum) %>%
select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>% #selecting "single study taxa"
mutate (across(-c(1:2), function(x) x/n)) %>%
mutate (across(-c(1:2), function(x) round (x,3))) %>%
as_tibble() %>% flextable() %>% set_caption("Mean percentage of single study O-taxa")
research_ID | n | Hypocreales_O | Pleosporales_O | Candidatus Abawacabacteria_O | Candidatus Peregrinibacteria_O | Capnodiales_O | Candidatus Terrybacteria_O | Eurotiales_O | Dothideales_O | eub62A3_O | LD1-PA32_O |
|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 46 | 0.000 | 0.000 | 0.009 | 0.008 | 0.000 | 0.008 | 0.000 | 0.000 | 0.013 | 0.051 |
2 | 36 | 0.062 | 0.037 | 0.000 | 0.000 | 0.012 | 0.000 | 0.014 | 0.027 | 0.000 | 0.000 |
3 | 70 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
4 | 87 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
5 | 25 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
6 | 22 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
7 | 95 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
#Determining the percentage of people in whom these taxa have not been detected
#O_taxon <- 'Candidatus Abawacabacteria_O'
#res_ID <- 1
#data_wide %>%
#select (research_ID, O_taxon) %>%
#filter (research_ID == res_ID) %>%
#summarise(across (-1, function (x) sum (x==0)/nrow (.) * 100)) %>%
#rename ("Taxon_zero_percentage, %" = O_taxon)
# Hypocreales_O - 41,7% нулей
# Pleosporales_O - 41,7% нулей
# Candidatus Abawacabacteria_O - 52,2% нулей
# Candidatus Peregrinibacteria_O - 52,2% нулей
# Capnodiales_O - 33,3% нулей
# Candidatus Terrybacteria_O - 39,1% нулей
# Eurotiales_O - 39,1% нулей
# Dothideales_O - 0% нулей
# eub62A3_O - 15,2% нулей
# LD1-PA32_O - 2,2% нулей
data_wide %>%
select (research_ID, ends_with("_C")) %>%
add_column(n = 1) %>% #for further counting the subjects number in each study
group_by(research_ID) %>%
summarise_each (sum) %>%
select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>% #selecting "single study taxa"
mutate (across(-c(1:2), function(x) x/n)) %>%
mutate (across(-c(1:2), function(x) round (x,3))) %>%
as_tibble() %>% flextable() %>% set_caption("Mean percentage of single study C-taxa")
research_ID | n | WWE3_C | Eurotiomycetes_C | Dothideomycetes_C |
|---|---|---|---|---|
1 | 46 | 0.000 | 0.000 | 0.000 |
2 | 36 | 0.000 | 0.023 | 0.102 |
3 | 70 | 0.000 | 0.000 | 0.000 |
4 | 87 | 0.014 | 0.000 | 0.000 |
5 | 25 | 0.000 | 0.000 | 0.000 |
6 | 22 | 0.000 | 0.000 | 0.000 |
7 | 95 | 0.000 | 0.000 | 0.000 |
#Determining the percentage of people in whom these taxa have not been detected
#С_taxon <- 'Dothideomycetes_C'
#res_ID <- 2
#data_wide %>%
#select (research_ID, С_taxon) %>%
#filter (research_ID == res_ID) %>%
#summarise(across (-1, function (x) sum (x==0)/nrow (.) * 100)) %>%
#rename ("Taxon_zero_percentage, %" = С_taxon)
# WWE3_C - 74,4% нулей
# Eurotiomycetes_C - 19,4% нулей
# Dothideomycetes_C - 0% нулей
“Single study P-taxa” have not been detected
data_wide <- data_wide %>%
select (!colnames (G_only_one) [- c (1,2)]) #deleting of single study G-taxa (in this single study the taxa were present in no more than 25% of subjects)
data_wide_not_batched <- data_wide
write_rds(data_wide,
file = "data/originals/data_wide_not_batched.rds")
clusters_number <- 7 #number of clusters
library(cluster)
library(factoextra)
G_scaled <- data_wide %>%
mutate(patient_ID = paste0("patient_", patient_ID)) %>%
column_to_rownames("patient_ID") %>%
select (ends_with("_G")) %>%
scale ()
agnes <- G_scaled %>%
dist (method = "euclidean") %>%
as.matrix() %>%
agnes( #Agglomerative clustering
diss = TRUE, #dissimilarity matrix
method = "ward")
fviz_dend(agnes,
k = clusters_number,
rect = TRUE,
k_colors = "jco",
show_labels = TRUE,
label_cols = ifelse (data_wide[agnes$order,]$Health_state == "Health", "green", "red"),
#labels colors = Health_state
cex = 0.2,
main = "Cluster dengrogram based on G-taxa abundance before batch effect correction",
ylab = "")
data_wide %>%
add_column("Cluster" = cutree(agnes, k = clusters_number)) %>%
select(any_of(colnames (combined_info)), Cluster, - patient_ID) %>%
tbl_summary(digits = list(all_continuous() ~ c(0, 0),
all_categorical() ~ c(0, 0)),
by = Cluster) %>%
add_p()
| Characteristic | 1, N = 461 | 2, N = 311 | 3, N = 1891 | 4, N = 231 | 5, N = 651 | 6, N = 21 | 7, N = 251 | p-value2 |
|---|---|---|---|---|---|---|---|---|
| research_ID | ||||||||
| 1 | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 2 | 0 (0%) | 31 (100%) | 4 (2%) | 1 (4%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 3 | 0 (0%) | 0 (0%) | 2 (1%) | 3 (13%) | 65 (100%) | 0 (0%) | 0 (0%) | |
| 4 | 0 (0%) | 0 (0%) | 66 (35%) | 19 (83%) | 0 (0%) | 2 (100%) | 0 (0%) | |
| 5 | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 25 (100%) | |
| 6 | 0 (0%) | 0 (0%) | 22 (12%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 7 | 0 (0%) | 0 (0%) | 95 (50%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Seq_region | ||||||||
| V3-V4 | 0 (0%) | 31 (100%) | 6 (3%) | 4 (17%) | 65 (100%) | 0 (0%) | 0 (0%) | |
| V4 | 0 (0%) | 0 (0%) | 183 (97%) | 19 (83%) | 0 (0%) | 2 (100%) | 25 (100%) | |
| V5-V6 | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Seq_date | ||||||||
| 2015 | 0 (0%) | 0 (0%) | 95 (50%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 2017 | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 25 (100%) | |
| 2018 | 46 (100%) | 31 (100%) | 11 (6%) | 7 (30%) | 0 (0%) | 1 (50%) | 0 (0%) | |
| 2020 | 0 (0%) | 0 (0%) | 36 (19%) | 2 (9%) | 0 (0%) | 1 (50%) | 0 (0%) | |
| 2021 | 0 (0%) | 0 (0%) | 21 (11%) | 11 (48%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 2022 | 0 (0%) | 0 (0%) | 23 (12%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 2023 | 0 (0%) | 0 (0%) | 3 (2%) | 3 (13%) | 65 (100%) | 0 (0%) | 0 (0%) | |
| Health_state | ||||||||
| Disease | 0 (0%) | 0 (0%) | 134 (71%) | 10 (43%) | 0 (0%) | 1 (50%) | 25 (100%) | |
| Health | 46 (100%) | 31 (100%) | 55 (29%) | 13 (57%) | 65 (100%) | 1 (50%) | 0 (0%) | |
| Age | NA (NA, NA) | NA (NA, NA) | 32 (27, 42) | 49 (43, 52) | NA (NA, NA) | 32 (32, 33) | 39 (28, 49) | 0.002 |
| Unknown | 46 | 31 | 101 | 4 | 65 | 0 | 0 | |
| Age_range | ||||||||
| > 43 | 0 (0%) | 0 (0%) | 21 (22%) | 13 (57%) | 0 (0%) | 0 (0%) | 10 (40%) | |
| 16-43 | 46 (100%) | 31 (100%) | 73 (78%) | 10 (43%) | 65 (100%) | 2 (100%) | 15 (60%) | |
| Unknown | 0 | 0 | 95 | 0 | 0 | 0 | 0 | |
| Sex | <0.001 | |||||||
| female | 0 (NA%) | 0 (0%) | 20 (42%) | 5 (24%) | 38 (58%) | 0 (0%) | 16 (64%) | |
| male | 0 (NA%) | 31 (100%) | 28 (58%) | 16 (76%) | 27 (42%) | 1 (100%) | 9 (36%) | |
| Unknown | 46 | 0 | 141 | 2 | 0 | 1 | 0 | |
| Country | ||||||||
| Australia | 0 (0%) | 0 (0%) | 2 (1%) | 4 (17%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Austria | 0 (0%) | 0 (0%) | 0 (0%) | 2 (9%) | 0 (0%) | 1 (50%) | 25 (100%) | |
| Belgium | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Germany | 0 (0%) | 0 (0%) | 45 (24%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Greece | 0 (0%) | 0 (0%) | 1 (1%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Hungary | 0 (0%) | 0 (0%) | 0 (0%) | 2 (9%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Ireland | 0 (0%) | 0 (0%) | 0 (0%) | 1 (4%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Israel | 0 (0%) | 0 (0%) | 23 (12%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Italy | 0 (0%) | 31 (100%) | 4 (2%) | 2 (9%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Norway | 0 (0%) | 0 (0%) | 1 (1%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Poland | 0 (0%) | 0 (0%) | 2 (1%) | 3 (13%) | 65 (100%) | 0 (0%) | 0 (0%) | |
| Spain | 0 (0%) | 0 (0%) | 95 (50%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| UK | 0 (0%) | 0 (0%) | 8 (4%) | 3 (13%) | 0 (0%) | 1 (50%) | 0 (0%) | |
| USA | 0 (0%) | 0 (0%) | 8 (4%) | 6 (26%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Race | 0.016 | |||||||
| African American | 0 (0%) | 0 (NA%) | 1 (2%) | 0 (0%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Asian or Pacific Islander | 0 (0%) | 0 (NA%) | 1 (2%) | 0 (0%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Caucasian | 46 (100%) | 0 (NA%) | 49 (78%) | 19 (100%) | 0 (NA%) | 2 (100%) | 0 (NA%) | |
| Hispanic | 0 (0%) | 0 (NA%) | 1 (2%) | 0 (0%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Other | 0 (0%) | 0 (NA%) | 11 (17%) | 0 (0%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Unknown | 0 | 31 | 126 | 4 | 65 | 0 | 25 | |
| Smoking | 0 (0%) | 0 (NA%) | 14 (21%) | 2 (9%) | 0 (0%) | 0 (0%) | 0 (NA%) | <0.001 |
| Unknown | 0 | 31 | 121 | 1 | 0 | 0 | 25 | |
| Alcohol | 0.8 | |||||||
| Never | 0 (NA%) | 0 (NA%) | 8 (12%) | 4 (21%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Occasionally (1-2 times/week) | 0 (NA%) | 0 (NA%) | 20 (30%) | 4 (21%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Rarely (a few times/month) | 0 (NA%) | 0 (NA%) | 26 (39%) | 8 (42%) | 0 (NA%) | 2 (100%) | 0 (NA%) | |
| Regularly (3-7 times/week) | 0 (NA%) | 0 (NA%) | 12 (18%) | 3 (16%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Unknown | 46 | 31 | 123 | 4 | 65 | 0 | 25 | |
| Antibiotics_usage | <0.001 | |||||||
| 1-6 months | 46 (100%) | 0 (0%) | 1 (5%) | 1 (8%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| 12 months/Not use | 0 (0%) | 31 (100%) | 21 (95%) | 11 (92%) | 65 (100%) | 1 (100%) | 0 (NA%) | |
| Unknown | 0 | 0 | 167 | 11 | 0 | 1 | 25 | |
| Physical_activity | 0 (0%) | 0 (NA%) | 14 (21%) | 2 (9%) | 0 (0%) | 0 (0%) | 0 (NA%) | <0.001 |
| Unknown | 0 | 31 | 121 | 1 | 0 | 0 | 25 | |
| Travel_period | 0.019 | |||||||
| 3 months | 0 (NA%) | 0 (NA%) | 12 (18%) | 6 (33%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| 6 months | 0 (NA%) | 0 (NA%) | 10 (15%) | 0 (0%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Month | 0 (NA%) | 0 (NA%) | 7 (11%) | 4 (22%) | 0 (NA%) | 2 (100%) | 0 (NA%) | |
| Year | 0 (NA%) | 0 (NA%) | 37 (56%) | 8 (44%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Unknown | 46 | 31 | 123 | 5 | 65 | 0 | 25 | |
| Education_level | 0.006 | |||||||
| Bachelor's level | 0 (NA%) | 0 (NA%) | 26 (41%) | 5 (28%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Master's level | 0 (NA%) | 0 (NA%) | 15 (23%) | 11 (61%) | 0 (NA%) | 2 (100%) | 0 (NA%) | |
| School Level | 0 (NA%) | 0 (NA%) | 23 (36%) | 2 (11%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Unknown | 46 | 31 | 125 | 5 | 65 | 0 | 25 | |
| Cosmetics | 0.2 | |||||||
| Never | 0 (NA%) | 0 (NA%) | 30 (46%) | 13 (68%) | 0 (NA%) | 1 (50%) | 0 (NA%) | |
| Occasionally (a few-8 times/month) | 0 (NA%) | 0 (NA%) | 16 (25%) | 1 (5%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Regularly (3-7 times/week) | 0 (NA%) | 0 (NA%) | 19 (29%) | 5 (26%) | 0 (NA%) | 1 (50%) | 0 (NA%) | |
| Unknown | 46 | 31 | 124 | 4 | 65 | 0 | 25 | |
| Sleep_duration | 0.2 | |||||||
| > 8 hours | 0 (NA%) | 0 (NA%) | 6 (9%) | 3 (16%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| 4-6 hours | 0 (NA%) | 0 (NA%) | 5 (8%) | 4 (21%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| 6-7 hours | 0 (NA%) | 0 (NA%) | 22 (33%) | 8 (42%) | 0 (NA%) | 1 (50%) | 0 (NA%) | |
| 7-8 hours | 0 (NA%) | 0 (NA%) | 33 (50%) | 4 (21%) | 0 (NA%) | 1 (50%) | 0 (NA%) | |
| Unknown | 46 | 31 | 123 | 4 | 65 | 0 | 25 | |
| BMI_category | 0.10 | |||||||
| normal/overweight | 46 (100%) | 31 (100%) | 133 (97%) | 11 (85%) | 65 (100%) | 1 (100%) | 0 (NA%) | |
| obese | 0 (0%) | 0 (0%) | 3 (2%) | 2 (15%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| underweight | 0 (0%) | 0 (0%) | 1 (1%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| Unknown | 0 | 0 | 52 | 10 | 0 | 1 | 25 | |
| 1 n (%); Median (IQR) | ||||||||
| 2 Kruskal-Wallis rank sum test; Fisher’s exact test | ||||||||
agnes2 <- data_wide %>%
column_to_rownames("patient_ID") %>%
select (ends_with("_G")) %>%
vegdist(method = "bray") %>% #distance matrix for Bray-Curtis dissimilarity:
#the Bray–Curtis dissimilarity is bounded between 0 and 1, where 0 means the two sites have the same composition , and 1 means the two sites do not share any species
as.matrix() %>%
agnes( #Agglomerative clustering
diss = TRUE, #dissimilarity matrix
method = "ward")
fviz_dend(agnes2,
cex = 0.2, k = clusters_number,
rect = TRUE,
k_colors = "jco",
color_labels_by_k = FALSE,
label_cols = ifelse (data_wide[agnes2$order,]$Health_state == "Health", "green", "red"),
#labels colors = Health_state
main = "Cluster dengrogram based on G-taxa abundance before batch effect correction",
ylab = "")
fisher.test.simulate.p.values <- function(data, variable, by, ...) {
result <- list()
test_results <- stats::fisher.test(data[[variable]], data[[by]], simulate.p.value = TRUE)
result$p <- test_results$p.value
result$test <- test_results$method
result
}
data_wide %>%
add_column("Cluster" = cutree(agnes2, k = clusters_number)) %>%
select(any_of(colnames (combined_info)), Cluster, - patient_ID) %>%
tbl_summary(digits = list(all_continuous() ~ c(0, 0),
all_categorical() ~ c(0, 0)),
by = Cluster) %>%
add_p(test = list(all_categorical() ~ "fisher.test.simulate.p.values"))
| Characteristic | 1, N = 461 | 2, N = 701 | 3, N = 1211 | 4, N = 411 | 5, N = 461 | 6, N = 211 | 7, N = 361 | p-value2 |
|---|---|---|---|---|---|---|---|---|
| research_ID | <0.001 | |||||||
| 1 | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 2 | 0 (0%) | 19 (27%) | 5 (4%) | 8 (20%) | 3 (7%) | 1 (5%) | 0 (0%) | |
| 3 | 0 (0%) | 25 (36%) | 26 (21%) | 8 (20%) | 9 (20%) | 0 (0%) | 2 (6%) | |
| 4 | 0 (0%) | 6 (9%) | 36 (30%) | 0 (0%) | 9 (20%) | 20 (95%) | 16 (44%) | |
| 5 | 0 (0%) | 2 (3%) | 10 (8%) | 0 (0%) | 0 (0%) | 0 (0%) | 13 (36%) | |
| 6 | 0 (0%) | 4 (6%) | 9 (7%) | 3 (7%) | 4 (9%) | 0 (0%) | 2 (6%) | |
| 7 | 0 (0%) | 14 (20%) | 35 (29%) | 22 (54%) | 21 (46%) | 0 (0%) | 3 (8%) | |
| Seq_region | <0.001 | |||||||
| V3-V4 | 0 (0%) | 44 (63%) | 31 (26%) | 16 (39%) | 12 (26%) | 1 (5%) | 2 (6%) | |
| V4 | 0 (0%) | 26 (37%) | 90 (74%) | 25 (61%) | 34 (74%) | 20 (95%) | 34 (94%) | |
| V5-V6 | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Seq_date | <0.001 | |||||||
| 2015 | 0 (0%) | 14 (20%) | 35 (29%) | 22 (54%) | 21 (46%) | 0 (0%) | 3 (8%) | |
| 2017 | 0 (0%) | 2 (3%) | 10 (8%) | 0 (0%) | 0 (0%) | 0 (0%) | 13 (36%) | |
| 2018 | 46 (100%) | 21 (30%) | 8 (7%) | 8 (20%) | 5 (11%) | 7 (33%) | 1 (3%) | |
| 2020 | 0 (0%) | 2 (3%) | 20 (17%) | 0 (0%) | 5 (11%) | 2 (10%) | 10 (28%) | |
| 2021 | 0 (0%) | 2 (3%) | 11 (9%) | 0 (0%) | 2 (4%) | 12 (57%) | 5 (14%) | |
| 2022 | 0 (0%) | 4 (6%) | 10 (8%) | 3 (7%) | 4 (9%) | 0 (0%) | 2 (6%) | |
| 2023 | 0 (0%) | 25 (36%) | 27 (22%) | 8 (20%) | 9 (20%) | 0 (0%) | 2 (6%) | |
| Health_state | <0.001 | |||||||
| Disease | 0 (0%) | 23 (33%) | 63 (52%) | 25 (61%) | 29 (63%) | 10 (48%) | 20 (56%) | |
| Health | 46 (100%) | 47 (67%) | 58 (48%) | 16 (39%) | 17 (37%) | 11 (52%) | 16 (44%) | |
| Age | NA (NA, NA) | 36 (29, 47) | 33 (27, 48) | 35 (30, 41) | 32 (29, 38) | 48 (42, 56) | 31 (26, 41) | 0.010 |
| Unknown | 46 | 58 | 66 | 38 | 33 | 1 | 5 | |
| Age_range | <0.001 | |||||||
| > 43 | 0 (0%) | 5 (9%) | 15 (17%) | 1 (5%) | 3 (12%) | 13 (62%) | 7 (21%) | |
| 16-43 | 46 (100%) | 51 (91%) | 71 (83%) | 18 (95%) | 22 (88%) | 8 (38%) | 26 (79%) | |
| Unknown | 0 | 14 | 35 | 22 | 21 | 0 | 3 | |
| Sex | 0.027 | |||||||
| female | 0 (NA%) | 18 (34%) | 36 (59%) | 4 (21%) | 7 (37%) | 6 (32%) | 8 (40%) | |
| male | 0 (NA%) | 35 (66%) | 25 (41%) | 15 (79%) | 12 (63%) | 13 (68%) | 12 (60%) | |
| Unknown | 46 | 17 | 60 | 22 | 27 | 2 | 16 | |
| Country | <0.001 | |||||||
| Australia | 0 (0%) | 0 (0%) | 1 (1%) | 0 (0%) | 0 (0%) | 4 (19%) | 1 (3%) | |
| Austria | 0 (0%) | 2 (3%) | 10 (8%) | 0 (0%) | 0 (0%) | 2 (10%) | 14 (39%) | |
| Belgium | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Germany | 0 (0%) | 3 (4%) | 25 (21%) | 0 (0%) | 5 (11%) | 0 (0%) | 12 (33%) | |
| Greece | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 1 (5%) | 0 (0%) | |
| Hungary | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 2 (10%) | 0 (0%) | |
| Ireland | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 1 (5%) | 0 (0%) | |
| Israel | 0 (0%) | 4 (6%) | 10 (8%) | 3 (7%) | 4 (9%) | 0 (0%) | 2 (6%) | |
| Italy | 0 (0%) | 19 (27%) | 5 (4%) | 8 (20%) | 3 (7%) | 2 (10%) | 0 (0%) | |
| Norway | 0 (0%) | 0 (0%) | 1 (1%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Poland | 0 (0%) | 25 (36%) | 26 (21%) | 8 (20%) | 9 (20%) | 0 (0%) | 2 (6%) | |
| Spain | 0 (0%) | 14 (20%) | 35 (29%) | 22 (54%) | 21 (46%) | 0 (0%) | 3 (8%) | |
| UK | 0 (0%) | 2 (3%) | 1 (1%) | 0 (0%) | 4 (9%) | 4 (19%) | 1 (3%) | |
| USA | 0 (0%) | 1 (1%) | 7 (6%) | 0 (0%) | 0 (0%) | 5 (24%) | 1 (3%) | |
| Race | <0.001 | |||||||
| African American | 0 (0%) | 0 (0%) | 1 (3%) | 0 (NA%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Asian or Pacific Islander | 0 (0%) | 0 (0%) | 0 (0%) | 0 (NA%) | 1 (11%) | 0 (0%) | 0 (0%) | |
| Caucasian | 46 (100%) | 6 (100%) | 28 (80%) | 0 (NA%) | 7 (78%) | 20 (100%) | 9 (64%) | |
| Hispanic | 0 (0%) | 0 (0%) | 0 (0%) | 0 (NA%) | 0 (0%) | 0 (0%) | 1 (7%) | |
| Other | 0 (0%) | 0 (0%) | 6 (17%) | 0 (NA%) | 1 (11%) | 0 (0%) | 4 (29%) | |
| Unknown | 0 | 64 | 86 | 41 | 37 | 1 | 22 | |
| Smoking | 0 (0%) | 2 (6%) | 7 (11%) | 0 (0%) | 1 (6%) | 3 (15%) | 3 (17%) | 0.080 |
| Unknown | 0 | 39 | 59 | 33 | 28 | 1 | 18 | |
| Alcohol | 0.7 | |||||||
| Never | 0 (NA%) | 0 (0%) | 5 (14%) | 0 (NA%) | 1 (11%) | 5 (25%) | 1 (6%) | |
| Occasionally (1-2 times/week) | 0 (NA%) | 1 (17%) | 12 (33%) | 0 (NA%) | 1 (11%) | 4 (20%) | 6 (38%) | |
| Rarely (a few times/month) | 0 (NA%) | 4 (67%) | 12 (33%) | 0 (NA%) | 4 (44%) | 8 (40%) | 8 (50%) | |
| Regularly (3-7 times/week) | 0 (NA%) | 1 (17%) | 7 (19%) | 0 (NA%) | 3 (33%) | 3 (15%) | 1 (6%) | |
| Unknown | 46 | 64 | 85 | 41 | 37 | 1 | 20 | |
| Antibiotics_usage | <0.001 | |||||||
| 1-6 months | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 1 (11%) | 1 (25%) | |
| 12 months/Not use | 0 (0%) | 47 (100%) | 39 (100%) | 16 (100%) | 16 (100%) | 8 (89%) | 3 (75%) | |
| Unknown | 0 | 23 | 82 | 25 | 30 | 12 | 32 | |
| Physical_activity | 0 (0%) | 2 (6%) | 7 (11%) | 0 (0%) | 1 (6%) | 3 (15%) | 3 (17%) | 0.089 |
| Unknown | 0 | 39 | 59 | 33 | 28 | 1 | 18 | |
| Travel_period | 0.051 | |||||||
| 3 months | 0 (NA%) | 0 (0%) | 10 (28%) | 0 (NA%) | 1 (11%) | 5 (26%) | 2 (13%) | |
| 6 months | 0 (NA%) | 0 (0%) | 8 (22%) | 0 (NA%) | 1 (11%) | 0 (0%) | 1 (6%) | |
| Month | 0 (NA%) | 0 (0%) | 2 (6%) | 0 (NA%) | 1 (11%) | 4 (21%) | 6 (38%) | |
| Year | 0 (NA%) | 6 (100%) | 16 (44%) | 0 (NA%) | 6 (67%) | 10 (53%) | 7 (44%) | |
| Unknown | 46 | 64 | 85 | 41 | 37 | 2 | 20 | |
| Education_level | 0.15 | |||||||
| Bachelor's level | 0 (NA%) | 1 (20%) | 11 (31%) | 0 (NA%) | 3 (33%) | 6 (32%) | 10 (63%) | |
| Master's level | 0 (NA%) | 1 (20%) | 10 (29%) | 0 (NA%) | 4 (44%) | 10 (53%) | 3 (19%) | |
| School Level | 0 (NA%) | 3 (60%) | 14 (40%) | 0 (NA%) | 2 (22%) | 3 (16%) | 3 (19%) | |
| Unknown | 46 | 65 | 86 | 41 | 37 | 2 | 20 | |
| Cosmetics | 0.8 | |||||||
| Never | 0 (NA%) | 3 (50%) | 18 (50%) | 0 (NA%) | 4 (50%) | 12 (60%) | 7 (44%) | |
| Occasionally (a few-8 times/month) | 0 (NA%) | 2 (33%) | 6 (17%) | 0 (NA%) | 3 (38%) | 2 (10%) | 4 (25%) | |
| Regularly (3-7 times/week) | 0 (NA%) | 1 (17%) | 12 (33%) | 0 (NA%) | 1 (13%) | 6 (30%) | 5 (31%) | |
| Unknown | 46 | 64 | 85 | 41 | 38 | 1 | 20 | |
| Sleep_duration | 0.2 | |||||||
| > 8 hours | 0 (NA%) | 2 (33%) | 2 (6%) | 0 (NA%) | 1 (11%) | 3 (15%) | 1 (6%) | |
| 4-6 hours | 0 (NA%) | 0 (0%) | 2 (6%) | 0 (NA%) | 1 (11%) | 4 (20%) | 2 (13%) | |
| 6-7 hours | 0 (NA%) | 2 (33%) | 12 (33%) | 0 (NA%) | 1 (11%) | 9 (45%) | 7 (44%) | |
| 7-8 hours | 0 (NA%) | 2 (33%) | 20 (56%) | 0 (NA%) | 6 (67%) | 4 (20%) | 6 (38%) | |
| Unknown | 46 | 64 | 85 | 41 | 37 | 1 | 20 | |
| BMI_category | 0.032 | |||||||
| normal/overweight | 46 (100%) | 64 (98%) | 80 (98%) | 41 (100%) | 39 (98%) | 8 (80%) | 9 (100%) | |
| obese | 0 (0%) | 1 (2%) | 2 (2%) | 0 (0%) | 0 (0%) | 2 (20%) | 0 (0%) | |
| underweight | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 1 (3%) | 0 (0%) | 0 (0%) | |
| Unknown | 0 | 5 | 39 | 0 | 6 | 11 | 27 | |
| 1 n (%); Median (IQR) | ||||||||
| 2 Fisher’s Exact Test for Count Data with simulated p-value (based on 2000 replicates); Kruskal-Wallis rank sum test | ||||||||
kmeans <- G_scaled %>%
kmeans(centers = clusters_number,
iter.max = 20, nstart = 35)
fviz_cluster(kmeans,data = G_scaled,
ellipse = FALSE,
show.clust.cent = FALSE,
geom = "point",
main = "kMeans clustering based on G-taxa abundance before batch effect correction")
the MBECS package was used, a step-by-step actions description is available at the link https://github.com/rmolbrich/MBECS
Preliminary report is available in /data/originals/.
research_ID batch-effect#Run corrections
mbec.obj <- mbecRunCorrections(
mbec.obj,
model.vars=c('research_ID', 'Health_state'),
#model.vars=c (batch effect, presumed biological effect of interest)
method = "bat", #batch effect correcting algorithm:
#Batch Mean Centering (bmc) /ComBat (bat)/Remove Batch Effect (rbe)
type = "otu") #which abundance matrix to use
## Found 459 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
#Post report
#mbecReportPost(
#input.obj=mbec.obj,
#model.vars=c('research_ID', 'Health_state'), #model.vars=c (batch effect, presumed biological effect of interest)
#type="otu", # which abundance matrix to use for evaluation: clr/otu/tss
#file.name = "Batch_(research_id)_Correction_Report_otu_bat",
#file.dir = "data/originals/",
#return.data = FALSE)
library (datawizard)
#Retrieve corrrected data
ps.cor.bat <- mbecGetPhyloseq(mbec.obj,
type="cor", #which type of data to add
label="bat") #specifies the name within the list
combined_bacteria_batched <- ps.cor.bat@otu_table %>%
as.data.frame() %>%
data_rotate (rownames = "patient_ID") %>%
mutate (patient_ID = sub ("S", "", patient_ID),
patient_ID = as.numeric(patient_ID))
data_wide_batched <- data_wide %>%
select (any_of(colnames (combined_info))) %>%
left_join(combined_bacteria_batched)
rm (combined_bacteria_batched)
data_wide <- data_wide_batched
write_rds(data_wide,
file = "data/originals/data_wide.rds")
####Principal Component Analysis
plot <- mbecPCA(input.obj=mbec.obj,
model.vars=c('research_ID', 'Health_state'),
type="otu", pca.axes=c(1,2), return.data = FALSE)
ggsave("data/pictures/PCA_befor_batching.jpeg", plot = plot)
plot <- mbecPCA(input.obj=mbec.obj,
model.vars=c('research_ID', 'Health_state'),
type="cor", label = "bat",
pca.axes=c(1,2), return.data = FALSE)
ggsave("data/pictures/PCA_after_batching.jpeg", plot = plot)
####Heatmap of the top 10 most variable taxa
mbecHeat(input.obj=mbec.obj, method = "TOP", n = 10,
model.vars=c('research_ID', 'Health_state'),
center = TRUE, scale = TRUE,
type="otu",
return.data = FALSE)
mbecHeat(input.obj=mbec.obj, method = "TOP", n = 10,
model.vars=c('research_ID', 'Health_state'),
center = TRUE, scale = TRUE,
type="cor", label = "bat",
return.data = FALSE)
G_scaled <- data_wide_batched %>%
mutate(patient_ID = paste0("patient_", patient_ID)) %>%
column_to_rownames("patient_ID") %>%
select (ends_with("_G")) %>%
scale ()
agnes <- G_scaled %>%
dist (method = "euclidean") %>%
as.matrix() %>%
agnes( #gglomerative clustering
diss = TRUE, #dissimilarity matrix
method = "ward")
fviz_dend(agnes,
k = clusters_number,
rect = TRUE,
k_colors = "jco",
show_labels = TRUE,
label_cols = ifelse (data_wide_batched[agnes$order,]$Health_state == "Health", "green", "red"),#labels color = Health_state
cex = 0.2,
main = "Cluster dengrogram based on G-taxa abundance after batch effect correction",
ylab = "")
data_wide %>%
add_column("Cluster" = cutree(agnes, k = clusters_number)) %>%
select(any_of(colnames (combined_info)), Cluster, - patient_ID) %>%
tbl_summary(digits = list(all_continuous() ~ c(0, 0),
all_categorical() ~ c(0, 0)),
by = Cluster) %>%
add_p(test = list(all_categorical() ~ "fisher.test.simulate.p.values"))
| Characteristic | 1, N = 461 | 2, N = 1571 | 3, N = 961 | 4, N = 221 | 5, N = 351 | 6, N = 21 | 7, N = 231 | p-value2 |
|---|---|---|---|---|---|---|---|---|
| research_ID | <0.001 | |||||||
| 1 | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 2 | 0 (0%) | 23 (15%) | 12 (13%) | 1 (5%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 3 | 0 (0%) | 13 (8%) | 23 (24%) | 4 (18%) | 30 (86%) | 0 (0%) | 0 (0%) | |
| 4 | 0 (0%) | 29 (18%) | 38 (40%) | 16 (73%) | 2 (6%) | 2 (100%) | 0 (0%) | |
| 5 | 0 (0%) | 2 (1%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 23 (100%) | |
| 6 | 0 (0%) | 18 (11%) | 3 (3%) | 0 (0%) | 1 (3%) | 0 (0%) | 0 (0%) | |
| 7 | 0 (0%) | 72 (46%) | 20 (21%) | 1 (5%) | 2 (6%) | 0 (0%) | 0 (0%) | |
| Seq_region | <0.001 | |||||||
| V3-V4 | 0 (0%) | 36 (23%) | 35 (36%) | 5 (23%) | 30 (86%) | 0 (0%) | 0 (0%) | |
| V4 | 0 (0%) | 121 (77%) | 61 (64%) | 17 (77%) | 5 (14%) | 2 (100%) | 23 (100%) | |
| V5-V6 | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Seq_date | <0.001 | |||||||
| 2015 | 0 (0%) | 72 (46%) | 20 (21%) | 1 (5%) | 2 (6%) | 0 (0%) | 0 (0%) | |
| 2017 | 0 (0%) | 2 (1%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 23 (100%) | |
| 2018 | 46 (100%) | 27 (17%) | 16 (17%) | 6 (27%) | 0 (0%) | 1 (50%) | 0 (0%) | |
| 2020 | 0 (0%) | 11 (7%) | 23 (24%) | 2 (9%) | 2 (6%) | 1 (50%) | 0 (0%) | |
| 2021 | 0 (0%) | 13 (8%) | 10 (10%) | 9 (41%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 2022 | 0 (0%) | 19 (12%) | 3 (3%) | 0 (0%) | 1 (3%) | 0 (0%) | 0 (0%) | |
| 2023 | 0 (0%) | 13 (8%) | 24 (25%) | 4 (18%) | 30 (86%) | 0 (0%) | 0 (0%) | |
| Health_state | <0.001 | |||||||
| Disease | 0 (0%) | 103 (66%) | 31 (32%) | 9 (41%) | 3 (9%) | 1 (50%) | 23 (100%) | |
| Health | 46 (100%) | 54 (34%) | 65 (68%) | 13 (59%) | 32 (91%) | 1 (50%) | 0 (0%) | |
| Age | NA (NA, NA) | 35 (29, 47) | 29 (26, 40) | 48 (42, 51) | 25 (25, 26) | 32 (32, 33) | 36 (28, 49) | 0.004 |
| Unknown | 46 | 108 | 55 | 6 | 32 | 0 | 0 | |
| Age_range | <0.001 | |||||||
| > 43 | 0 (0%) | 15 (18%) | 9 (12%) | 11 (52%) | 0 (0%) | 0 (0%) | 9 (39%) | |
| 16-43 | 46 (100%) | 70 (82%) | 67 (88%) | 10 (48%) | 33 (100%) | 2 (100%) | 14 (61%) | |
| Unknown | 0 | 72 | 20 | 1 | 2 | 0 | 0 | |
| Sex | 0.030 | |||||||
| female | 0 (NA%) | 23 (32%) | 19 (41%) | 5 (26%) | 18 (58%) | 0 (0%) | 14 (61%) | |
| male | 0 (NA%) | 48 (68%) | 27 (59%) | 14 (74%) | 13 (42%) | 1 (100%) | 9 (39%) | |
| Unknown | 46 | 86 | 50 | 3 | 4 | 1 | 0 | |
| Country | <0.001 | |||||||
| Australia | 0 (0%) | 2 (1%) | 1 (1%) | 3 (14%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Austria | 0 (0%) | 2 (1%) | 0 (0%) | 2 (9%) | 0 (0%) | 1 (50%) | 23 (100%) | |
| Belgium | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Germany | 0 (0%) | 14 (9%) | 29 (30%) | 0 (0%) | 2 (6%) | 0 (0%) | 0 (0%) | |
| Greece | 0 (0%) | 1 (1%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Hungary | 0 (0%) | 0 (0%) | 0 (0%) | 2 (9%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Ireland | 0 (0%) | 0 (0%) | 0 (0%) | 1 (5%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Israel | 0 (0%) | 19 (12%) | 3 (3%) | 0 (0%) | 1 (3%) | 0 (0%) | 0 (0%) | |
| Italy | 0 (0%) | 23 (15%) | 12 (13%) | 2 (9%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Norway | 0 (0%) | 0 (0%) | 1 (1%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Poland | 0 (0%) | 13 (8%) | 23 (24%) | 4 (18%) | 30 (86%) | 0 (0%) | 0 (0%) | |
| Spain | 0 (0%) | 72 (46%) | 20 (21%) | 1 (5%) | 2 (6%) | 0 (0%) | 0 (0%) | |
| UK | 0 (0%) | 5 (3%) | 3 (3%) | 3 (14%) | 0 (0%) | 1 (50%) | 0 (0%) | |
| USA | 0 (0%) | 6 (4%) | 4 (4%) | 4 (18%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Race | 0.023 | |||||||
| African American | 0 (0%) | 0 (0%) | 1 (3%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| Asian or Pacific Islander | 0 (0%) | 0 (0%) | 1 (3%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| Caucasian | 46 (100%) | 21 (75%) | 29 (81%) | 16 (100%) | 2 (100%) | 2 (100%) | 0 (NA%) | |
| Hispanic | 0 (0%) | 1 (4%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| Other | 0 (0%) | 6 (21%) | 5 (14%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| Unknown | 0 | 129 | 60 | 6 | 33 | 0 | 23 | |
| Smoking | 0 (0%) | 6 (14%) | 8 (13%) | 1 (5%) | 1 (3%) | 0 (0%) | 0 (NA%) | 0.045 |
| Unknown | 0 | 115 | 35 | 2 | 3 | 0 | 23 | |
| Alcohol | 0.2 | |||||||
| Never | 0 (NA%) | 7 (24%) | 3 (8%) | 2 (13%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| Occasionally (1-2 times/week) | 0 (NA%) | 5 (17%) | 14 (37%) | 4 (25%) | 1 (50%) | 0 (0%) | 0 (NA%) | |
| Rarely (a few times/month) | 0 (NA%) | 8 (28%) | 17 (45%) | 8 (50%) | 1 (50%) | 2 (100%) | 0 (NA%) | |
| Regularly (3-7 times/week) | 0 (NA%) | 9 (31%) | 4 (11%) | 2 (13%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| Unknown | 46 | 128 | 58 | 6 | 33 | 0 | 23 | |
| Antibiotics_usage | <0.001 | |||||||
| 1-6 months | 46 (100%) | 1 (2%) | 0 (0%) | 1 (9%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| 12 months/Not use | 0 (0%) | 45 (98%) | 43 (100%) | 10 (91%) | 30 (100%) | 1 (100%) | 0 (NA%) | |
| Unknown | 0 | 111 | 53 | 11 | 5 | 1 | 23 | |
| Physical_activity | 0 (0%) | 6 (14%) | 8 (13%) | 1 (5%) | 1 (3%) | 0 (0%) | 0 (NA%) | 0.049 |
| Unknown | 0 | 115 | 35 | 2 | 3 | 0 | 23 | |
| Travel_period | 0.009 | |||||||
| 3 months | 0 (NA%) | 5 (17%) | 8 (21%) | 5 (33%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| 6 months | 0 (NA%) | 3 (10%) | 7 (18%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| Month | 0 (NA%) | 2 (7%) | 3 (8%) | 4 (27%) | 2 (100%) | 2 (100%) | 0 (NA%) | |
| Year | 0 (NA%) | 19 (66%) | 20 (53%) | 6 (40%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| Unknown | 46 | 128 | 58 | 7 | 33 | 0 | 23 | |
| Education_level | 0.10 | |||||||
| Bachelor's level | 0 (NA%) | 9 (31%) | 15 (42%) | 5 (33%) | 2 (100%) | 0 (0%) | 0 (NA%) | |
| Master's level | 0 (NA%) | 11 (38%) | 7 (19%) | 8 (53%) | 0 (0%) | 2 (100%) | 0 (NA%) | |
| School Level | 0 (NA%) | 9 (31%) | 14 (39%) | 2 (13%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| Unknown | 46 | 128 | 60 | 7 | 33 | 0 | 23 | |
| Cosmetics | 0.13 | |||||||
| Never | 0 (NA%) | 12 (43%) | 20 (53%) | 11 (69%) | 0 (0%) | 1 (50%) | 0 (NA%) | |
| Occasionally (a few-8 times/month) | 0 (NA%) | 5 (18%) | 11 (29%) | 1 (6%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| Regularly (3-7 times/week) | 0 (NA%) | 11 (39%) | 7 (18%) | 4 (25%) | 2 (100%) | 1 (50%) | 0 (NA%) | |
| Unknown | 46 | 129 | 58 | 6 | 33 | 0 | 23 | |
| Sleep_duration | 0.093 | |||||||
| > 8 hours | 0 (NA%) | 2 (7%) | 4 (11%) | 2 (13%) | 1 (50%) | 0 (0%) | 0 (NA%) | |
| 4-6 hours | 0 (NA%) | 4 (14%) | 1 (3%) | 4 (25%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| 6-7 hours | 0 (NA%) | 7 (24%) | 15 (39%) | 7 (44%) | 1 (50%) | 1 (50%) | 0 (NA%) | |
| 7-8 hours | 0 (NA%) | 16 (55%) | 18 (47%) | 3 (19%) | 0 (0%) | 1 (50%) | 0 (NA%) | |
| Unknown | 46 | 128 | 58 | 6 | 33 | 0 | 23 | |
| BMI_category | 0.14 | |||||||
| normal/overweight | 46 (100%) | 133 (98%) | 63 (98%) | 11 (85%) | 33 (100%) | 1 (100%) | 0 (NA%) | |
| obese | 0 (0%) | 2 (1%) | 1 (2%) | 2 (15%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| underweight | 0 (0%) | 1 (1%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| Unknown | 0 | 21 | 32 | 9 | 2 | 1 | 23 | |
| 1 n (%); Median (IQR) | ||||||||
| 2 Fisher’s Exact Test for Count Data with simulated p-value (based on 2000 replicates); Kruskal-Wallis rank sum test | ||||||||
kmeans <- G_scaled %>%
kmeans(centers = clusters_number,
iter.max = 20,
nstart = 35)
fviz_cluster(kmeans,data = G_scaled,
ellipse = FALSE,
show.clust.cent = FALSE,
geom = "point",
main = "kMeans clustering based on G-taxa abundance after batch effect correction")
agnes2 <- data_wide_batched %>%
column_to_rownames("patient_ID") %>%
select (ends_with("_G")) %>%
vegdist(method = "bray") %>%
as.matrix() %>%
agnes( #Agglomerative clustering
diss = TRUE, #dissimilarity matrix
method = "ward")
fviz_dend(agnes2,
cex = 0.2, k = clusters_number,
rect = TRUE,
k_colors = "jco",
color_labels_by_k = FALSE,
label_cols = ifelse (data_wide[agnes2$order,]$Health_state == "Health", "green", "red"),
#labels colors = Health_state
main = "Cluster dengrogram based on G-taxa abundance after batch effect correction",
ylab = "")
data_wide %>%
add_column("Cluster" = cutree(agnes2, k = clusters_number)) %>%
select(any_of(colnames (combined_info)), Cluster, - patient_ID) %>%
tbl_summary(digits = list(all_continuous() ~ c(0, 0),
all_categorical() ~ c(0, 0)),
by = Cluster) %>%
add_p(test = list(all_categorical() ~ "fisher.test.simulate.p.values"))
| Characteristic | 1, N = 461 | 2, N = 211 | 3, N = 681 | 4, N = 581 | 5, N = 741 | 6, N = 951 | 7, N = 191 | p-value2 |
|---|---|---|---|---|---|---|---|---|
| research_ID | <0.001 | |||||||
| 1 | 7 (15%) | 3 (14%) | 18 (26%) | 11 (19%) | 7 (9%) | 0 (0%) | 0 (0%) | |
| 2 | 10 (22%) | 0 (0%) | 8 (12%) | 3 (5%) | 12 (16%) | 2 (2%) | 1 (5%) | |
| 3 | 13 (28%) | 2 (10%) | 24 (35%) | 11 (19%) | 13 (18%) | 6 (6%) | 1 (5%) | |
| 4 | 15 (33%) | 2 (10%) | 6 (9%) | 9 (16%) | 15 (20%) | 26 (27%) | 14 (74%) | |
| 5 | 0 (0%) | 9 (43%) | 0 (0%) | 2 (3%) | 2 (3%) | 12 (13%) | 0 (0%) | |
| 6 | 1 (2%) | 3 (14%) | 1 (1%) | 4 (7%) | 2 (3%) | 11 (12%) | 0 (0%) | |
| 7 | 0 (0%) | 2 (10%) | 11 (16%) | 18 (31%) | 23 (31%) | 38 (40%) | 3 (16%) | |
| Seq_region | <0.001 | |||||||
| V3-V4 | 23 (50%) | 2 (10%) | 32 (47%) | 14 (24%) | 25 (34%) | 8 (8%) | 2 (11%) | |
| V4 | 16 (35%) | 16 (76%) | 18 (26%) | 33 (57%) | 42 (57%) | 87 (92%) | 17 (89%) | |
| V5-V6 | 7 (15%) | 3 (14%) | 18 (26%) | 11 (19%) | 7 (9%) | 0 (0%) | 0 (0%) | |
| Seq_date | <0.001 | |||||||
| 2015 | 0 (0%) | 2 (10%) | 11 (16%) | 18 (31%) | 23 (31%) | 38 (40%) | 3 (16%) | |
| 2017 | 0 (0%) | 9 (43%) | 0 (0%) | 2 (3%) | 2 (3%) | 12 (13%) | 0 (0%) | |
| 2018 | 17 (37%) | 3 (14%) | 26 (38%) | 16 (28%) | 22 (30%) | 7 (7%) | 5 (26%) | |
| 2020 | 11 (24%) | 0 (0%) | 4 (6%) | 5 (9%) | 7 (9%) | 9 (9%) | 3 (16%) | |
| 2021 | 4 (9%) | 2 (10%) | 2 (3%) | 2 (3%) | 5 (7%) | 10 (11%) | 7 (37%) | |
| 2022 | 1 (2%) | 3 (14%) | 1 (1%) | 4 (7%) | 2 (3%) | 12 (13%) | 0 (0%) | |
| 2023 | 13 (28%) | 2 (10%) | 24 (35%) | 11 (19%) | 13 (18%) | 7 (7%) | 1 (5%) | |
| Health_state | <0.001 | |||||||
| Disease | 1 (2%) | 15 (71%) | 13 (19%) | 28 (48%) | 31 (42%) | 74 (78%) | 8 (42%) | |
| Health | 45 (98%) | 6 (29%) | 55 (81%) | 30 (52%) | 43 (58%) | 21 (22%) | 11 (58%) | |
| Age | 27 (25, 37) | 34 (26, 46) | 32 (28, 33) | 32 (28, 41) | 28 (26, 46) | 41 (29, 49) | 47 (33, 52) | 0.023 |
| Unknown | 30 | 7 | 61 | 43 | 55 | 46 | 5 | |
| Age_range | <0.001 | |||||||
| > 43 | 2 (4%) | 5 (26%) | 0 (0%) | 4 (10%) | 6 (12%) | 19 (33%) | 8 (50%) | |
| 16-43 | 44 (96%) | 14 (74%) | 57 (100%) | 36 (90%) | 45 (88%) | 38 (67%) | 8 (50%) | |
| Unknown | 0 | 2 | 11 | 18 | 23 | 38 | 3 | |
| Sex | 0.007 | |||||||
| female | 12 (46%) | 7 (47%) | 14 (41%) | 8 (35%) | 8 (24%) | 28 (62%) | 2 (14%) | |
| male | 14 (54%) | 8 (53%) | 20 (59%) | 15 (65%) | 26 (76%) | 17 (38%) | 12 (86%) | |
| Unknown | 20 | 6 | 34 | 35 | 40 | 50 | 5 | |
| Country | <0.001 | |||||||
| Australia | 2 (4%) | 0 (0%) | 0 (0%) | 0 (0%) | 1 (1%) | 0 (0%) | 3 (16%) | |
| Austria | 0 (0%) | 9 (43%) | 0 (0%) | 2 (3%) | 2 (3%) | 12 (13%) | 3 (16%) | |
| Belgium | 7 (15%) | 3 (14%) | 18 (26%) | 11 (19%) | 7 (9%) | 0 (0%) | 0 (0%) | |
| Germany | 13 (28%) | 1 (5%) | 5 (7%) | 5 (9%) | 10 (14%) | 11 (12%) | 0 (0%) | |
| Greece | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 1 (1%) | 0 (0%) | |
| Hungary | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 2 (11%) | |
| Ireland | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 1 (1%) | 0 (0%) | |
| Israel | 1 (2%) | 3 (14%) | 1 (1%) | 4 (7%) | 2 (3%) | 12 (13%) | 0 (0%) | |
| Italy | 10 (22%) | 0 (0%) | 8 (12%) | 3 (5%) | 12 (16%) | 2 (2%) | 2 (11%) | |
| Norway | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 1 (1%) | 0 (0%) | 0 (0%) | |
| Poland | 13 (28%) | 2 (10%) | 24 (35%) | 11 (19%) | 13 (18%) | 6 (6%) | 1 (5%) | |
| Spain | 0 (0%) | 2 (10%) | 11 (16%) | 18 (31%) | 23 (31%) | 38 (40%) | 3 (16%) | |
| UK | 0 (0%) | 0 (0%) | 0 (0%) | 4 (7%) | 2 (3%) | 2 (2%) | 4 (21%) | |
| USA | 0 (0%) | 1 (5%) | 1 (1%) | 0 (0%) | 1 (1%) | 10 (11%) | 1 (5%) | |
| Race | 0.10 | |||||||
| African American | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 1 (4%) | 0 (0%) | |
| Asian or Pacific Islander | 0 (0%) | 0 (0%) | 0 (0%) | 1 (5%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Caucasian | 14 (70%) | 5 (100%) | 24 (100%) | 18 (90%) | 19 (86%) | 22 (88%) | 14 (100%) | |
| Hispanic | 1 (5%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Other | 5 (25%) | 0 (0%) | 0 (0%) | 1 (5%) | 3 (14%) | 2 (8%) | 0 (0%) | |
| Unknown | 26 | 16 | 44 | 38 | 52 | 70 | 5 | |
| Smoking | 2 (6%) | 1 (14%) | 3 (6%) | 1 (3%) | 5 (14%) | 3 (9%) | 1 (7%) | 0.6 |
| Unknown | 11 | 14 | 20 | 27 | 39 | 63 | 4 | |
| Alcohol | 0.7 | |||||||
| Never | 2 (13%) | 0 (0%) | 1 (17%) | 1 (11%) | 1 (7%) | 5 (19%) | 2 (14%) | |
| Occasionally (1-2 times/week) | 5 (33%) | 0 (0%) | 3 (50%) | 1 (11%) | 5 (33%) | 7 (27%) | 3 (21%) | |
| Rarely (a few times/month) | 8 (53%) | 1 (50%) | 1 (17%) | 4 (44%) | 7 (47%) | 8 (31%) | 7 (50%) | |
| Regularly (3-7 times/week) | 0 (0%) | 1 (50%) | 1 (17%) | 3 (33%) | 2 (13%) | 6 (23%) | 2 (14%) | |
| Unknown | 31 | 19 | 62 | 49 | 59 | 69 | 5 | |
| Antibiotics_usage | 0.004 | |||||||
| 1-6 months | 7 (23%) | 4 (67%) | 18 (35%) | 11 (38%) | 7 (19%) | 0 (0%) | 1 (17%) | |
| 12 months/Not use | 23 (77%) | 2 (33%) | 33 (65%) | 18 (62%) | 29 (81%) | 19 (100%) | 5 (83%) | |
| Unknown | 16 | 15 | 17 | 29 | 38 | 76 | 13 | |
| Physical_activity | 2 (6%) | 1 (14%) | 3 (6%) | 1 (3%) | 5 (14%) | 3 (9%) | 1 (7%) | 0.6 |
| Unknown | 11 | 14 | 20 | 27 | 39 | 63 | 4 | |
| Travel_period | 0.15 | |||||||
| 3 months | 4 (27%) | 0 (0%) | 2 (33%) | 1 (11%) | 2 (13%) | 5 (19%) | 4 (31%) | |
| 6 months | 2 (13%) | 0 (0%) | 1 (17%) | 1 (11%) | 4 (27%) | 2 (8%) | 0 (0%) | |
| Month | 3 (20%) | 0 (0%) | 2 (33%) | 1 (11%) | 0 (0%) | 2 (8%) | 5 (38%) | |
| Year | 6 (40%) | 2 (100%) | 1 (17%) | 6 (67%) | 9 (60%) | 17 (65%) | 4 (31%) | |
| Unknown | 31 | 19 | 62 | 49 | 59 | 69 | 6 | |
| Education_level | 0.4 | |||||||
| Bachelor's level | 9 (60%) | 1 (50%) | 3 (50%) | 3 (33%) | 5 (38%) | 6 (23%) | 4 (31%) | |
| Master's level | 2 (13%) | 0 (0%) | 1 (17%) | 4 (44%) | 3 (23%) | 11 (42%) | 7 (54%) | |
| School Level | 4 (27%) | 1 (50%) | 2 (33%) | 2 (22%) | 5 (38%) | 9 (35%) | 2 (15%) | |
| Unknown | 31 | 19 | 62 | 49 | 61 | 69 | 6 | |
| Cosmetics | 0.2 | |||||||
| Never | 5 (33%) | 2 (100%) | 1 (17%) | 4 (50%) | 9 (60%) | 13 (50%) | 10 (71%) | |
| Occasionally (a few-8 times/month) | 3 (20%) | 0 (0%) | 2 (33%) | 3 (38%) | 3 (20%) | 6 (23%) | 0 (0%) | |
| Regularly (3-7 times/week) | 7 (47%) | 0 (0%) | 3 (50%) | 1 (13%) | 3 (20%) | 7 (27%) | 4 (29%) | |
| Unknown | 31 | 19 | 62 | 50 | 59 | 69 | 5 | |
| Sleep_duration | 0.3 | |||||||
| > 8 hours | 3 (20%) | 0 (0%) | 0 (0%) | 1 (11%) | 2 (13%) | 1 (4%) | 2 (14%) | |
| 4-6 hours | 1 (7%) | 1 (50%) | 0 (0%) | 1 (11%) | 1 (7%) | 1 (4%) | 4 (29%) | |
| 6-7 hours | 5 (33%) | 1 (50%) | 2 (33%) | 1 (11%) | 5 (33%) | 12 (46%) | 5 (36%) | |
| 7-8 hours | 6 (40%) | 0 (0%) | 4 (67%) | 6 (67%) | 7 (47%) | 12 (46%) | 3 (21%) | |
| Unknown | 31 | 19 | 62 | 49 | 59 | 69 | 5 | |
| BMI_category | 0.8 | |||||||
| normal/overweight | 30 (97%) | 11 (100%) | 62 (98%) | 49 (98%) | 60 (98%) | 66 (97%) | 9 (100%) | |
| obese | 1 (3%) | 0 (0%) | 1 (2%) | 0 (0%) | 1 (2%) | 2 (3%) | 0 (0%) | |
| underweight | 0 (0%) | 0 (0%) | 0 (0%) | 1 (2%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Unknown | 15 | 10 | 5 | 8 | 13 | 27 | 10 | |
| 1 n (%); Median (IQR) | ||||||||
| 2 Fisher’s Exact Test for Count Data with simulated p-value (based on 2000 replicates); Kruskal-Wallis rank sum test | ||||||||
path <- "data/raw/Bacterial group functions.xlsx"
taxon <- c ("TaxonName", "Rank")
neuromediators <- read_xlsx (path, 2) %>%
mutate(Destroy = ifelse(is.na (Destroy), "produce", "destroy")) %>%
unique() %>%
pivot_wider(names_from = Neuromediator, values_from = Destroy)
probiotics <- read_xlsx (path, 3) %>%
add_column(probiotics = 1)
special_properties <- read_xlsx (path, 4) %>%
add_column(special_properties = 1)
vitamins <- read_xlsx (path, 5) %>%
pivot_wider (names_from = Vitamin, values_from = Vitamin,
values_fn = function(x) ifelse(is.na (x), NA, 1))
habbits <- read_xlsx (path, 7) %>%
unique() %>% #deleting duplicate rows
pivot_wider(names_from = Habbit, values_from = Habit_state)
bac_functions <- read_xlsx (path, 1) %>% #Pathogens and undesirable
full_join(neuromediators, by = taxon) %>% #Neurotransmitters
full_join(probiotics, by = taxon) %>% #Probiotics
full_join(special_properties, by = taxon) %>% #With special properties
full_join(vitamins, by = taxon) %>% #Vitamins
full_join(read_xlsx (path, 6), by = taxon) %>% #Producers of short-chain fatty acids
full_join(habbits, by = taxon) %>% #Bad habits
unite("Taxon", TaxonName, Rank, sep = "_") %>%
filter (Taxon != "Blautia obeum_S") %>% #for this taxon, there is conflicting information in the Producers of SCFA
mutate_all(as.factor)
rm (path, taxon, neuromediators, probiotics, special_properties, vitamins, habbits)
# Сonverting the data_wide to a long format
data_long <- data_wide %>%
pivot_longer(ends_with(c("_D", "_P", "_O", "_C", "_F", "_G")),
names_to = "Taxon", values_to = "Percentage") %>%
left_join (bac_functions, by = "Taxon") #combining with `bac_functions` datasets
write_rds(data_long,
file = "data/originals/data_long.rds",
compress = "gz")
#Creation a long dataset for each taxon
G_long <- data_long %>% subset(grepl("_G", Taxon))
F_long <- data_long %>% subset(grepl("_F", Taxon))
C_long <- data_long %>% subset(grepl("_C", Taxon))
O_long <- data_long %>% subset(grepl("_O", Taxon))
P_long <- data_long %>% subset(grepl("_P", Taxon))
write_rds(G_long,
file = "data/originals/G_long.rds", "gz")
write_rds(F_long,
file = "data/originals/F_long.rds", "gz")
write_rds(C_long,
file = "data/originals/C_long.rds", "gz")
write_rds(O_long,
file = "data/originals/O_long.rds", "gz")
write_rds(P_long,
file = "data/originals/P_long.rds", "gz")
comparison <- function (special_data, special_property) {
#vector of the G_taxa names with special properties:
bac_names <- special_data %>%
subset(grepl("_G", Taxon)) %>% .$Taxon
#number of the G_taxa in data_wide
number_of_taxons <- data_wide %>%
select(patient_ID, Health_state, any_of(bac_names)) %>%
ncol() - 2
# t-test
t <- data_wide %>%
select(patient_ID, Health_state, any_of(bac_names)) %>%
pivot_longer(ends_with("_G"),
names_to = "Taxon", values_to = "Percentage") %>%
group_by(patient_ID, Health_state) %>%
summarise(across(Percentage, sum)) %>%
ungroup() %>%
t.test (Percentage ~ Health_state, .)
#a tibble with results
tibble("G_taxa" = special_property,
"Number of taxa" = number_of_taxons,
"Mean of total percentage in IBS" = t$estimate["mean in group Disease"],
"Mean of total percentage in healthy" = t$estimate["mean in group Health"],
"95% CI_1" = t$conf.int[1],
"95% CI_2" = t$conf.int[2],
"p.unadjusted" = t$p.value)
}
comparison_in_total <- rbind(
comparison (bac_functions %>% filter (Серотонин == "produce"), "Serotonin producing"),
comparison (bac_functions %>% filter (Серотонин == "destroy"), "Serotonin destroying"),
comparison (bac_functions %>% filter (Ацетилхолин == "produce"), "Acetylcholine producing"),
comparison (bac_functions %>% filter (ГАМК == "produce"), "GABA producing"),
comparison (bac_functions %>% filter (ГАМК == "destroy"), "GABA destroying"),
comparison (bac_functions %>% filter (Дофамин == "produce"), "Dopamine producing"),
comparison (bac_functions %>% filter (Дофамин == "destroy"), "Dopamine destroying"),
comparison (bac_functions %>% filter (probiotics == 1), "Probiotics"),
comparison (bac_functions %>% filter (special_properties == 1), "Special_properties"),
comparison (bac_functions %>% filter (Gases == 1), "Gases producing"),
comparison (bac_functions %>% filter (Oral == 1), "Oral"),
comparison (bac_functions %>% filter (Inflammatory == 1), "Inflammatory"),
comparison (bac_functions %>% filter (Bacteria_category == "Патоген"), "Pathogens"),
comparison (bac_functions %>% filter (Bacteria_category == "Условно-нормальный"), "Conditionally normal"),
comparison (bac_functions %>% filter (A == 1), "Vitamin A producing"),
comparison (bac_functions %>% filter (B1 == 1), "Vitamin B1 producing"),
comparison (bac_functions %>% filter (B12 == 1), "Vitamin B12 producing"),
comparison (bac_functions %>% filter (B2 == 1), "Vitamin B2 producing"),
comparison (bac_functions %>% filter (B3 == 1), "Vitamin B3 producing"),
comparison (bac_functions %>% filter (B5 == 1), "Vitamin B5 producing"),
comparison (bac_functions %>% filter (B6 == 1), "Vitamin B6 producing"),
comparison (bac_functions %>% filter (B7 == 1), "Vitamin B7 producing"),
comparison (bac_functions %>% filter (B9 == 1), "Vitamin B9 producing"),
comparison (bac_functions %>% filter (D3 == 1), "Vitamin D3 producing"),
comparison (bac_functions %>% filter (K2 == 1), "Vitamin K2 producing"),
comparison (bac_functions %>% filter (Ацетат == 1), "Acetate producing"),
comparison (bac_functions %>% filter (Пропионат == 1), "Propionate producing"),
comparison (bac_functions %>% filter (`Масляная кислота` == 1), "Butyrate producing"),
comparison (bac_functions %>% filter (Кофе == "Увеличена"), "Increase with coffee"),
comparison (bac_functions %>% filter (Курение == "Увеличена"), "Increase with smoking"),
comparison (bac_functions %>% filter (Курение == "Уменьшена"), "Derease with smoking"),
comparison (bac_functions %>% filter (Алкоголь == "Увеличена"), "Increase with alcohol"),
comparison (bac_functions %>% filter (Алкоголь == "Уменьшена"), "Derease with alcohol")
) %>%
filter (`Number of taxa` >= 3) %>%
add_column(.before = "95% CI_1", "Difference in means" = .$"Mean of total percentage in IBS" - .$"Mean of total percentage in healthy") %>%
mutate (across (c ("Mean of total percentage in IBS", "Mean of total percentage in healthy", "Difference in means", "95% CI_1", "95% CI_2"), function (x) round (x,2)),
p.unadjusted = round (p.unadjusted, 3),
p.value.holm = p.adjust(p.unadjusted, "holm")) %>%
unite("95% CI_1", "95% CI_2", col = "95% CI", sep = ":") %>%
arrange(desc (abs (.$"Difference in means")))
write_rds(comparison_in_total,
file = "data/originals/comparison_in_total.rds")
comparison_in_total %>%
mutate (p.unadjusted = ifelse(p.unadjusted < 0.001, "< 0.001", p.unadjusted),
p.value.holm = ifelse(p.value.holm < 0.001, "< 0.001", p.value.holm)) %>%
flextable() %>% theme_box() %>%
align(align = "center", part = "all") %>%
flextable::footnote (i = 1, j = 7, value = as_paragraph (c ("Welch Two Sample t-test")),
ref_symbols = "1", part = "header") %>%
set_caption("Results of IBS/healthy people comparison of means of total percentage of G-taxa with special properties")
G_taxa | Number of taxa | Mean of total percentage in IBS | Mean of total percentage in healthy | Difference in means | 95% CI | p.unadjusted1 | p.value.holm |
|---|---|---|---|---|---|---|---|
Inflammatory | 48 | 5.34 | 3.00 | 2.34 | 1.65:3.03 | < 0.001 | < 0.001 |
Propionate producing | 18 | 22.95 | 20.66 | 2.29 | 0.11:4.48 | 0.04 | 0.8 |
Acetate producing | 31 | 26.19 | 24.30 | 1.89 | -0.31:4.08 | 0.092 | 1 |
Increase with alcohol | 3 | 15.25 | 13.38 | 1.87 | -0.14:3.87 | 0.068 | 1 |
Increase with coffee | 5 | 7.71 | 9.49 | -1.78 | -2.66:-0.89 | < 0.001 | < 0.001 |
Vitamin B12 producing | 4 | 4.69 | 6.23 | -1.54 | -2.35:-0.74 | < 0.001 | < 0.001 |
Pathogens | 7 | 1.31 | 0.07 | 1.24 | 0.91:1.57 | < 0.001 | < 0.001 |
Special_properties | 34 | 11.12 | 12.28 | -1.16 | -2.15:-0.17 | 0.022 | 0.484 |
Conditionally normal | 50 | 4.37 | 3.27 | 1.10 | 0.43:1.77 | 0.001 | 0.023 |
Gases producing | 9 | 2.77 | 2.01 | 0.76 | 0.38:1.14 | < 0.001 | < 0.001 |
Serotonin destroying | 7 | 2.97 | 2.25 | 0.72 | 0.09:1.36 | 0.026 | 0.546 |
Probiotics | 5 | 1.97 | 2.48 | -0.51 | -1.15:0.12 | 0.115 | 1 |
Vitamin B9 producing | 9 | 3.34 | 3.83 | -0.50 | -1.14:0.15 | 0.131 | 1 |
Serotonin producing | 20 | 5.75 | 6.20 | -0.45 | -1.24:0.34 | 0.266 | 1 |
Increase with smoking | 10 | 2.47 | 2.02 | 0.45 | -0.1:1 | 0.11 | 1 |
GABA producing | 6 | 2.82 | 3.27 | -0.44 | -1.09:0.2 | 0.175 | 1 |
Vitamin D3 producing | 4 | 2.90 | 3.28 | -0.38 | -1.04:0.28 | 0.257 | 1 |
Vitamin A producing | 4 | 2.77 | 3.13 | -0.36 | -1.02:0.29 | 0.279 | 1 |
Vitamin B1 producing | 7 | 3.00 | 3.36 | -0.36 | -1.04:0.31 | 0.292 | 1 |
Vitamin B5 producing | 13 | 3.90 | 3.57 | 0.33 | -0.13:0.79 | 0.159 | 1 |
Vitamin B2 producing | 29 | 7.46 | 7.73 | -0.27 | -1.11:0.57 | 0.534 | 1 |
Derease with smoking | 4 | 3.52 | 3.34 | 0.18 | -0.52:0.88 | 0.621 | 1 |
Butyrate producing | 34 | 25.06 | 25.16 | -0.11 | -2.17:1.95 | 0.919 | 1 |
Vitamin B3 producing | 16 | 6.90 | 7.00 | -0.10 | -0.92:0.72 | 0.811 | 1 |
Vitamin B6 producing | 25 | 5.90 | 5.81 | 0.09 | -0.71:0.89 | 0.829 | 1 |
Vitamin B7 producing | 14 | 3.01 | 3.09 | -0.08 | -0.58:0.42 | 0.75 | 1 |
Oral | 18 | 0.80 | 0.86 | -0.07 | -0.33:0.19 | 0.612 | 1 |
Vitamin K2 producing | 3 | 0.03 | 0.03 | 0.00 | -0.02:0.01 | 0.631 | 1 |
1Welch Two Sample t-test | |||||||
combined_bacteria_G <- data_wide %>%
select(Health_state, ends_with("_G"))
d <- dist(combined_bacteria_G) # euclidean distances between the rows
fit <- cmdscale(d,eig=TRUE, k=2) # k is the number of dim
df_mds <- data.frame(
x = fit$points[,1],
y = fit$points[,2]
)
df_full <- cbind(df_mds, combined_info) %>%
mutate(Health_state_n = case_when(Health_state == "Health" ~ 0,
Health_state == "Disease" ~ 1))
ggplot(df_full, aes(x = x, y = y, color = Health_state)) +
geom_point() +
theme_bw() +
ggtitle("Распределение вектора G-таксонов в зависимости от группы пациентов")
adonis2(d ~ Health_state_n, data = df_full)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = d ~ Health_state_n, data = df_full)
## Df SumOfSqs R2 F Pr(>F)
## Health_state_n 1 1046 0.0122 4.6817 0.007 **
## Residual 379 84659 0.9878
## Total 380 85705 1.0000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Wilcox_comparison_round_0 <- data_wide %>%
select(Health_state, Archaea, Bacteria,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>%
mutate(across (where (is.numeric), function (x) round (x,0))) %>%
summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>%
pivot_longer(everything()) %>%
rename (Taxon = name, p_value = value) %>%
filter (p_value <= 0.05 ) %>%
arrange(p_value) %>%
add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>%
add_column(p_value_BH = p.adjust(.$p_value, "BH"))
rbind (
"Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_0),
"Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_0 %>% filter (p_value_holm <= 0.05 )),
"Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_0 %>% filter (p_value_BH <= 0.05 ))
)
## [,1]
## Количество значимо различающихся таксонов по p_value 100
## Количество значимо различающихся таксонов по p_value_holm 54
## Количество значимо различающихся таксонов по p_value_BH 100
Wilcox_comparison_round_1 <- data_wide %>%
select(Health_state, Archaea, Bacteria,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>%
mutate(across (where (is.numeric), function (x) round (x,1))) %>%
summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>%
pivot_longer(everything()) %>%
rename (Taxon = name, p_value = value) %>%
filter (p_value <= 0.05 ) %>%
arrange(p_value) %>%
add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>%
add_column(p_value_BH = p.adjust(.$p_value, "BH"))
rbind (
"Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_1),
"Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_1 %>% filter (p_value_holm <= 0.05 )),
"Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_1 %>% filter (p_value_BH <= 0.05 ))
)
## [,1]
## Количество значимо различающихся таксонов по p_value 273
## Количество значимо различающихся таксонов по p_value_holm 155
## Количество значимо различающихся таксонов по p_value_BH 273
Wilcox_comparison_round_2 <- data_wide %>%
select(Health_state, Archaea, Bacteria,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>%
mutate(across (where (is.numeric), function (x) round (x,2))) %>%
summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>%
pivot_longer(everything()) %>%
rename (Taxon = name, p_value = value) %>%
filter (p_value <= 0.05 ) %>%
arrange(p_value) %>%
add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>%
add_column(p_value_BH = p.adjust(.$p_value, "BH"))
rbind (
"Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_2),
"Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_2 %>% filter (p_value_holm <= 0.05 )),
"Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_2 %>% filter (p_value_BH <= 0.05 ))
)
## [,1]
## Количество значимо различающихся таксонов по p_value 599
## Количество значимо различающихся таксонов по p_value_holm 374
## Количество значимо различающихся таксонов по p_value_BH 599
library("haven")
library("ResourceSelection") ## Package to perform the Hosmer-Lemeshow GOF test
library("survey")
library("prediction")
library("margins")
library("ggeffects")
library("sjPlot")
library("statmod")
#devtools::install_github("strengejacke/strengejacke")
#install.packages(c("haven", "ResourceSelection", "survey", "prediction", "margins", "ggeffects", "sjPlot", "statmod"))
options(survey.lonely.psu = 'adjust')
start_values <- c(0, 0, 1)
analyze_taxon <- function(taxon_name, data) {
tryCatch(
{
data_filtered <- data %>%
filter(Taxon == taxon_name) %>%
select(patient_ID, Health_state, Percentage, Age_range, research_ID) %>%
mutate(Health_state_num = ifelse(Health_state == "Health", 0, 1)) # для упрацения интерпритации процентов
mepsdsgn = svydesign(
id = ~patient_ID,
strata = ~research_ID,
weights = NULL,
data = data_filtered,
nest = TRUE)
start_values <- c(0, 0, 1)
model <- svyglm(Percentage ~ Health_state_num + research_ID,
mepsdsgn,
family = tweedie(var.power = 2, link.power = 1),
start = start_values)
summary_table <- summary(model)
tidy_output <- tidy(model)
result_table1 <- tidy_output %>%
filter(term == "Health_state_num") %>%
select(estimate, p.value)
result_table2 <- as.data.frame(confint(model)["Health_state_num", ])
result_table2_transposed <- t(result_table2)
final_result_table_transposed <- bind_cols(result_table1, result_table2_transposed) %>%
select(estimate, `2.5 %`, `97.5 %`, p.value)
final_result_table_transposed$Taxon <- taxon_name
return(final_result_table_transposed)
},
error = function(e) {
# Обработка ошибки (можно добавить сообщение или просто вернуть NULL)
# cat("Error in analyze_taxon for Taxon:", taxon_name, "\n")
return(NULL)
}
)
}
# Применение функции ко всем таксонам
unique_taxa <- unique(G_long$Taxon)
result_list <- lapply(unique_taxa, function(taxon) {
analyze_result <- analyze_taxon(taxon, G_long)
if (!is.null(analyze_result)) {
return(analyze_result)
} else {
return(data.frame()) # Вернуть пустой data.frame, чтобы не влиять на bind_rows
}
})
# Объединение результатов в один dataframe
final_result_df <- bind_rows(result_list) %>%
mutate(estimate = round(estimate, 3),
`2.5 %` = round(`2.5 %`, 4),
`97.5 %` = round(`97.5 %`, 4)
)
# Коррекция на множественные сравнения по методу Холма
final_result_df$p.adjusted <- round(p.adjust(final_result_df$p.value, method = "holm"),3)
# Фильтрация только статистически значимых результатов
significant_results_G <- final_result_df %>%
filter(p.adjusted < 0.05) %>% arrange(desc(abs(estimate)))
#install.packages("knitr")
#install.packages("kableExtra")
# Загрузка библиотек
library(knitr)
library(kableExtra)
# Ваш код
# Вывод таблички
cat("Статистически значимые результаты:\n")
## Статистически значимые результаты:
kable(significant_results_G, "html") %>%
kable_styling(full_width = F)
| estimate | 2.5 % | 97.5 % | p.value | Taxon | p.adjusted |
|---|---|---|---|---|---|
| 0.114 | 0.0529 | 0.1743 | 0.0002665 | Methanobrevibacter_G | 0.031 |
| -0.107 | -0.1274 | -0.0869 | 0.0000000 | Flavobacterium_G | 0.000 |
| 0.086 | 0.0564 | 0.1152 | 0.0000000 | Senegalimassilia_G | 0.000 |
| 0.065 | 0.0375 | 0.0929 | 0.0000052 | Solobacterium_G | 0.001 |
| 0.058 | 0.0444 | 0.0707 | 0.0000000 | Acidaminobacter_G | 0.000 |
| -0.050 | -0.0664 | -0.0329 | 0.0000000 | Alkaliflexus_G | 0.000 |
| 0.050 | 0.0277 | 0.0724 | 0.0000141 | Faecalitalea_G | 0.002 |
| 0.018 | 0.0117 | 0.0242 | 0.0000000 | Lactiplantibacillus_G | 0.000 |
| -0.016 | -0.0198 | -0.0116 | 0.0000000 | Thermacetogenium_G | 0.000 |
| 0.013 | 0.0082 | 0.0178 | 0.0000002 | Oxalobacter_G | 0.000 |
| 0.013 | 0.0095 | 0.0173 | 0.0000000 | Porphyrobacter_G | 0.000 |
| 0.013 | 0.0071 | 0.0197 | 0.0000338 | Flavonifractor_G | 0.004 |
| 0.010 | 0.0062 | 0.0129 | 0.0000000 | Anoxybacillus_G | 0.000 |
| 0.009 | 0.0063 | 0.0127 | 0.0000000 | Lachnospiraceae NK4B4 group_G | 0.000 |
| 0.009 | 0.0071 | 0.0104 | 0.0000000 | Halochromatium_G | 0.000 |
| 0.009 | 0.0068 | 0.0113 | 0.0000000 | Anaerostignum_G | 0.000 |
| -0.009 | -0.0113 | -0.0075 | 0.0000000 | Erysipelotrichaceae UCG-002_G | 0.000 |
| 0.007 | 0.0057 | 0.0084 | 0.0000000 | Colwellia_G | 0.000 |
| 0.007 | 0.0036 | 0.0112 | 0.0001343 | Marinobacter_G | 0.016 |
| -0.007 | -0.0101 | -0.0038 | 0.0000201 | Candidatus Armantifilum_G | 0.003 |
| 0.006 | 0.0047 | 0.0079 | 0.0000000 | Hespellia_G | 0.000 |
| -0.006 | -0.0077 | -0.0052 | 0.0000000 | Sedimentibacter_G | 0.000 |
| -0.006 | -0.0086 | -0.0030 | 0.0000666 | Rikenella_G | 0.008 |
| 0.004 | 0.0032 | 0.0048 | 0.0000000 | Syntrophomonas_G | 0.000 |
| 0.004 | 0.0022 | 0.0052 | 0.0000021 | Lentilactobacillus_G | 0.000 |
| 0.004 | 0.0019 | 0.0054 | 0.0000631 | Rubrobacter_G | 0.008 |
| 0.004 | 0.0030 | 0.0047 | 0.0000000 | Pseudarthrobacter_G | 0.000 |
| 0.004 | 0.0024 | 0.0058 | 0.0000022 | Acholeplasma_G | 0.000 |
| -0.004 | -0.0046 | -0.0029 | 0.0000000 | Crocinitomix_G | 0.000 |
| -0.004 | -0.0062 | -0.0024 | 0.0000098 | Macellibacteroides_G | 0.001 |
| 0.003 | 0.0022 | 0.0046 | 0.0000000 | Hassallia_G | 0.000 |
| 0.003 | 0.0015 | 0.0040 | 0.0000102 | Halobacillus_G | 0.001 |
| 0.003 | 0.0012 | 0.0040 | 0.0001789 | RB41_G | 0.021 |
| 0.003 | 0.0013 | 0.0041 | 0.0001600 | Antarcticibacterium_G | 0.019 |
| -0.003 | -0.0036 | -0.0016 | 0.0000005 | Leptococcus JA-3-3Ab_G | 0.000 |
| -0.003 | -0.0037 | -0.0016 | 0.0000009 | Desulfurispora_G | 0.000 |
| -0.003 | -0.0036 | -0.0016 | 0.0000004 | Achromobacter_G | 0.000 |
| -0.003 | -0.0047 | -0.0019 | 0.0000036 | Coriobacteriaceae UCG-002_G | 0.001 |
| -0.003 | -0.0031 | -0.0021 | 0.0000000 | Formosa_G | 0.000 |
| -0.003 | -0.0039 | -0.0022 | 0.0000000 | Paramaledivibacter_G | 0.000 |
| -0.003 | -0.0043 | -0.0014 | 0.0001027 | Hydrogenispora_G | 0.013 |
| -0.003 | -0.0033 | -0.0021 | 0.0000000 | Serpentinicella_G | 0.000 |
| -0.003 | -0.0037 | -0.0021 | 0.0000000 | Oxobacter_G | 0.000 |
| -0.003 | -0.0035 | -0.0023 | 0.0000000 | Lachnotalea_G | 0.000 |
| -0.003 | -0.0032 | -0.0023 | 0.0000000 | Subsaximicrobium_G | 0.000 |
| -0.003 | -0.0046 | -0.0017 | 0.0000214 | Salinibacter_G | 0.003 |
| -0.003 | -0.0040 | -0.0013 | 0.0000805 | XBB1006_G | 0.010 |
| -0.003 | -0.0042 | -0.0016 | 0.0000109 | Pygmaiobacter_G | 0.001 |
| -0.003 | -0.0045 | -0.0016 | 0.0000334 | Marinifilum_G | 0.004 |
| 0.002 | 0.0011 | 0.0023 | 0.0000001 | Anaerospora_G | 0.000 |
| 0.002 | 0.0012 | 0.0023 | 0.0000000 | Truepera_G | 0.000 |
| 0.002 | 0.0010 | 0.0028 | 0.0000743 | Nocardia_G | 0.009 |
| -0.002 | -0.0031 | -0.0015 | 0.0000001 | Acetoanaerobium_G | 0.000 |
| -0.002 | -0.0026 | -0.0010 | 0.0000107 | Chthoniobacter_G | 0.001 |
| 0.002 | 0.0012 | 0.0038 | 0.0001601 | Spirochaeta 2_G | 0.019 |
| -0.001 | -0.0013 | -0.0006 | 0.0000014 | Arcicella_G | 0.000 |
| -0.001 | -0.0014 | -0.0006 | 0.0000029 | Pseudoclostridium_G | 0.000 |
| -0.001 | -0.0014 | -0.0006 | 0.0000020 | Clostridiisalibacter_G | 0.000 |
| 0.001 | 0.0007 | 0.0017 | 0.0000031 | Sporanaerobacter_G | 0.000 |
| 0.001 | 0.0008 | 0.0017 | 0.0000000 | Leucobacter_G | 0.000 |
| -0.001 | -0.0018 | -0.0007 | 0.0000117 | Virgibacillus_G | 0.002 |
| -0.001 | -0.0015 | -0.0007 | 0.0000001 | [Eubacterium] yurii group_G | 0.000 |
| -0.001 | -0.0020 | -0.0008 | 0.0000040 | Aeriscardovia_G | 0.001 |
| -0.001 | -0.0020 | -0.0009 | 0.0000008 | Blvii28 wastewater-sludge group_G | 0.000 |
| -0.001 | -0.0021 | -0.0008 | 0.0000194 | Izimaplasma_G | 0.003 |
| -0.001 | -0.0021 | -0.0007 | 0.0000428 | Lawsonella_G | 0.005 |
# Применение функции ко всем таксонам
unique_taxa <- unique(F_long$Taxon)
result_list <- lapply(unique_taxa, function(taxon) {
analyze_result <- analyze_taxon(taxon, F_long)
if (!is.null(analyze_result)) {
return(analyze_result)
} else {
return(data.frame()) # Вернуть пустой data.frame, чтобы не влиять на bind_rows
}
})
# Объединение результатов в один dataframe
final_result_df <- bind_rows(result_list) %>%
mutate(estimate = round(estimate, 3),
`2.5 %` = round(`2.5 %`, 4),
`97.5 %` = round(`97.5 %`, 4)
)
# Коррекция на множественные сравнения по методу Холма
final_result_df$p.adjusted <- round(p.adjust(final_result_df$p.value, method = "holm"),3)
# Фильтрация только статистически значимых результатов
significant_results_F <- final_result_df %>%
filter(p.adjusted < 0.05) %>% arrange(desc(abs(estimate)))
# Вывод таблички
cat("Статистически значимые результаты:\n")
## Статистически значимые результаты:
kable(significant_results_F, "html") %>%
kable_styling(full_width = F)
| estimate | 2.5 % | 97.5 % | p.value | Taxon | p.adjusted |
|---|---|---|---|---|---|
| 0.891 | 0.7287 | 1.0532 | 0.0000000 | Pedosphaeraceae_F | 0.000 |
| 0.301 | 0.2544 | 0.3471 | 0.0000000 | A4b_F | 0.000 |
| -0.080 | -0.1152 | -0.0445 | 0.0000117 | Methanobacteriaceae_F | 0.001 |
| 0.063 | 0.0480 | 0.0783 | 0.0000000 | Chromatiaceae_F | 0.000 |
| 0.058 | 0.0444 | 0.0707 | 0.0000000 | Acidaminobacteraceae_F | 0.000 |
| -0.055 | -0.0739 | -0.0360 | 0.0000000 | Clade II_F | 0.000 |
| -0.021 | -0.0329 | -0.0097 | 0.0003409 | Puniceicoccaceae_F | 0.017 |
| -0.016 | -0.0198 | -0.0117 | 0.0000000 | Thermacetogeniaceae_F | 0.000 |
| 0.010 | 0.0054 | 0.0150 | 0.0000326 | Sphingomonadaceae_F | 0.002 |
| 0.007 | 0.0036 | 0.0111 | 0.0001574 | Micromonosporaceae_F | 0.008 |
| 0.007 | 0.0036 | 0.0112 | 0.0001343 | Marinobacteraceae_F | 0.007 |
| 0.006 | 0.0034 | 0.0078 | 0.0000009 | WD2101 soil group_F | 0.000 |
| 0.006 | 0.0051 | 0.0073 | 0.0000000 | Iamiaceae_F | 0.000 |
| 0.006 | 0.0038 | 0.0079 | 0.0000000 | Colwelliaceae_F | 0.000 |
| 0.006 | 0.0037 | 0.0082 | 0.0000004 | Hydrogenedensaceae_F | 0.000 |
| 0.006 | 0.0043 | 0.0076 | 0.0000000 | NS11-12 marine group_F | 0.000 |
| -0.006 | -0.0077 | -0.0052 | 0.0000000 | Sedimentibacteraceae_F | 0.000 |
| -0.004 | -0.0057 | -0.0016 | 0.0004772 | Didymellaceae_F | 0.024 |
| 0.004 | 0.0022 | 0.0059 | 0.0000247 | Thermomonosporaceae_F | 0.001 |
| 0.004 | 0.0031 | 0.0055 | 0.0000000 | Syntrophomonadaceae_F | 0.000 |
| -0.004 | -0.0042 | -0.0032 | 0.0000000 | Aureobasidiaceae_F | 0.000 |
| 0.004 | 0.0019 | 0.0054 | 0.0000631 | Rubrobacteriaceae_F | 0.004 |
| 0.003 | 0.0013 | 0.0040 | 0.0001420 | Pyrinomonadaceae_F | 0.008 |
| 0.003 | 0.0019 | 0.0033 | 0.0000000 | Obscuribacteraceae_F | 0.000 |
| 0.003 | 0.0020 | 0.0034 | 0.0000000 | Limnochordaceae_F | 0.000 |
| -0.003 | -0.0036 | -0.0016 | 0.0000005 | Leptococcaceae_F | 0.000 |
| -0.003 | -0.0037 | -0.0016 | 0.0000009 | Desulfurisporaceae_F | 0.000 |
| -0.003 | -0.0044 | -0.0013 | 0.0003096 | Salinivirgaceae_F | 0.016 |
| 0.003 | 0.0020 | 0.0038 | 0.0000000 | Xanthobacteraceae_F | 0.000 |
| -0.003 | -0.0037 | -0.0021 | 0.0000000 | Oxobacteraceae_F | 0.000 |
| -0.003 | -0.0043 | -0.0017 | 0.0000112 | Chthoniobacteraceae_F | 0.001 |
| -0.003 | -0.0040 | -0.0014 | 0.0000790 | Pirellulaceae_F | 0.004 |
| -0.002 | -0.0033 | -0.0015 | 0.0000001 | Trichocomaceae_F | 0.000 |
| 0.002 | 0.0012 | 0.0023 | 0.0000000 | Trueperaceae_F | 0.000 |
| 0.002 | 0.0016 | 0.0030 | 0.0000000 | type III_F | 0.000 |
| -0.002 | -0.0026 | -0.0012 | 0.0000004 | Izemoplasmataceae_F | 0.000 |
| 0.002 | 0.0007 | 0.0025 | 0.0006969 | Rs-E47 termite group_F | 0.033 |
| 0.002 | 0.0019 | 0.0030 | 0.0000000 | 09D2Z48_F | 0.000 |
| -0.002 | -0.0038 | -0.0010 | 0.0006464 | TRA3-20_F | 0.031 |
| 0.002 | 0.0020 | 0.0028 | 0.0000000 | Simkaniaceae_F | 0.000 |
| -0.002 | -0.0033 | -0.0015 | 0.0000001 | PeH15_F | 0.000 |
| -0.001 | -0.0012 | -0.0003 | 0.0005701 | Bacteriovoracaceae_F | 0.028 |
| 0.001 | 0.0006 | 0.0021 | 0.0006965 | Halothiobacillaceae_F | 0.033 |
| -0.001 | -0.0019 | -0.0009 | 0.0000003 | Cladosporiaceae_F | 0.000 |
| -0.001 | -0.0020 | -0.0008 | 0.0000131 | 01D2Z36_F | 0.001 |
| 0.001 | 0.0009 | 0.0017 | 0.0000000 | Streptosporangiaceae_F | 0.000 |
# Применение функции ко всем таксонам
unique_taxa <- unique(C_long$Taxon)
result_list <- lapply(unique_taxa, function(taxon) {
analyze_result <- analyze_taxon(taxon, C_long)
if (!is.null(analyze_result)) {
return(analyze_result)
} else {
return(data.frame()) # Вернуть пустой data.frame, чтобы не влиять на bind_rows
}
})
# Объединение результатов в один dataframe
final_result_df <- bind_rows(result_list) %>%
mutate(estimate = round(estimate, 3),
`2.5 %` = round(`2.5 %`, 4),
`97.5 %` = round(`97.5 %`, 4)
)
# Коррекция на множественные сравнения по методу Холма
final_result_df$p.adjusted <- round(p.adjust(final_result_df$p.value, method = "holm"),3)
# Фильтрация только статистически значимых результатов
significant_results_C <- final_result_df %>%
filter(p.adjusted < 0.05) %>% arrange(desc(abs(estimate)))
# Вывод таблички
cat("Статистически значимые результаты:\n")
## Статистически значимые результаты:
kable(significant_results_C, "html") %>%
kable_styling(full_width = F)
| estimate | 2.5 % | 97.5 % | p.value | Taxon | p.adjusted |
|---|---|---|---|---|---|
| -0.080 | -0.1152 | -0.0445 | 1.17e-05 | Methanobacteria_C | 0.000 |
| 0.036 | 0.0275 | 0.0435 | 0.00e+00 | Thermodesulfovibrionia_C | 0.000 |
| 0.022 | 0.0172 | 0.0265 | 0.00e+00 | Armatimonadia_C | 0.000 |
| -0.017 | -0.0222 | -0.0124 | 0.00e+00 | Dothideomycetes_C | 0.000 |
| -0.016 | -0.0198 | -0.0117 | 0.00e+00 | Thermacetogenia_C | 0.000 |
| 0.014 | 0.0105 | 0.0166 | 0.00e+00 | Acidimicrobiia_C | 0.000 |
| 0.012 | 0.0090 | 0.0144 | 0.00e+00 | Chlamydiae_C | 0.000 |
| 0.010 | 0.0072 | 0.0126 | 0.00e+00 | Dehalococcoidia_C | 0.000 |
| 0.009 | 0.0064 | 0.0108 | 0.00e+00 | Desulfotomaculia_C | 0.000 |
| 0.006 | 0.0037 | 0.0082 | 4.00e-07 | Hydrogenedentia_C | 0.000 |
| 0.006 | 0.0034 | 0.0078 | 1.00e-06 | Vicinamibacteria_C | 0.000 |
| -0.006 | -0.0074 | -0.0036 | 0.00e+00 | D8A-2_C | 0.000 |
| 0.006 | 0.0042 | 0.0076 | 0.00e+00 | Thermoleophilia_C | 0.000 |
| 0.005 | 0.0030 | 0.0071 | 2.80e-06 | Ignavibacteria_C | 0.000 |
| -0.004 | -0.0052 | -0.0026 | 0.00e+00 | Eurotiomycetes_C | 0.000 |
| 0.004 | 0.0031 | 0.0055 | 0.00e+00 | Syntrophomonadia_C | 0.000 |
| 0.004 | 0.0019 | 0.0054 | 6.31e-05 | Rubrobacteria_C | 0.001 |
| 0.004 | 0.0026 | 0.0055 | 0.00e+00 | vadinHA49_C | 0.000 |
| 0.003 | 0.0017 | 0.0042 | 4.70e-06 | MB-A2-108_C | 0.000 |
# Применение функции ко всем таксонам
unique_taxa <- unique(O_long$Taxon)
result_list <- lapply(unique_taxa, function(taxon) {
analyze_result <- analyze_taxon(taxon, O_long)
if (!is.null(analyze_result)) {
return(analyze_result)
} else {
return(data.frame()) # Вернуть пустой data.frame, чтобы не влиять на bind_rows
}
})
# Объединение результатов в один dataframe
final_result_df <- bind_rows(result_list) %>%
mutate(estimate = round(estimate, 3),
`2.5 %` = round(`2.5 %`, 4),
`97.5 %` = round(`97.5 %`, 4)
)
# Коррекция на множественные сравнения по методу Холма
final_result_df$p.adjusted <- round(p.adjust(final_result_df$p.value, method = "holm"),3)
# Фильтрация только статистически значимых результатов
significant_results_O <- final_result_df %>%
filter(p.adjusted < 0.05) %>% arrange(desc(abs(estimate)))
# Вывод таблички
cat("Статистически значимые результаты:\n")
## Статистически значимые результаты:
kable(significant_results_O, "html") %>%
kable_styling(full_width = F)
| estimate | 2.5 % | 97.5 % | p.value | Taxon | p.adjusted |
|---|---|---|---|---|---|
| 0.891 | 0.7287 | 1.0532 | 0.0000000 | Pedosphaerales_O | 0.000 |
| -0.080 | -0.1152 | -0.0445 | 0.0000117 | Methanobacteriales_O | 0.000 |
| 0.062 | 0.0465 | 0.0773 | 0.0000000 | Chromatiales_O | 0.000 |
| 0.024 | 0.0176 | 0.0306 | 0.0000000 | Rhizobiales_O | 0.000 |
| 0.022 | 0.0172 | 0.0265 | 0.0000000 | Armatimonadales_O | 0.000 |
| -0.016 | -0.0198 | -0.0117 | 0.0000000 | Thermacetogeniales_O | 0.000 |
| -0.012 | -0.0141 | -0.0100 | 0.0000000 | Candidatus Kerfeldbacteria_O | 0.000 |
| -0.011 | -0.0162 | -0.0051 | 0.0001987 | Hypocreales_O | 0.007 |
| 0.010 | 0.0087 | 0.0120 | 0.0000000 | LD1-PA32_O | 0.000 |
| 0.010 | 0.0054 | 0.0150 | 0.0000326 | Sphingomonadales_O | 0.001 |
| 0.008 | 0.0059 | 0.0106 | 0.0000000 | Microtrichales_O | 0.000 |
| 0.008 | 0.0054 | 0.0096 | 0.0000000 | Desulfotomaculales_O | 0.000 |
| 0.007 | 0.0036 | 0.0111 | 0.0001574 | Micromonosporales_O | 0.006 |
| -0.007 | -0.0093 | -0.0046 | 0.0000000 | Chthoniobacterales_O | 0.000 |
| -0.006 | -0.0092 | -0.0033 | 0.0000447 | Pleosporales_O | 0.002 |
| 0.006 | 0.0037 | 0.0082 | 0.0000004 | Hydrogenedentiales_O | 0.000 |
| 0.005 | 0.0032 | 0.0077 | 0.0000024 | Tepidisphaerales_O | 0.000 |
| -0.005 | -0.0051 | -0.0040 | 0.0000000 | Dothideales_O | 0.000 |
| 0.004 | 0.0031 | 0.0055 | 0.0000000 | Syntrophomonadales_O | 0.000 |
| 0.004 | 0.0019 | 0.0054 | 0.0000631 | Rubrobacterales_O | 0.002 |
| 0.004 | 0.0014 | 0.0058 | 0.0015949 | Entomoplasmatales_O | 0.048 |
| 0.003 | 0.0013 | 0.0040 | 0.0001420 | Pyrinomonadales_O | 0.005 |
| 0.003 | 0.0019 | 0.0033 | 0.0000000 | Obscuribacterales_O | 0.000 |
| -0.003 | -0.0036 | -0.0016 | 0.0000005 | Eurycoccales_O | 0.000 |
| 0.003 | 0.0022 | 0.0037 | 0.0000000 | Limnochordales_O | 0.000 |
| 0.003 | 0.0019 | 0.0034 | 0.0000000 | eub62A3_O | 0.000 |
| 0.003 | 0.0015 | 0.0053 | 0.0003859 | Synechococcales_O | 0.012 |
| -0.003 | -0.0042 | -0.0017 | 0.0000072 | RBG-13-54-9_O | 0.000 |
| -0.003 | -0.0040 | -0.0014 | 0.0000790 | Pirellulales_O | 0.003 |
| 0.002 | 0.0009 | 0.0027 | 0.0000904 | Candidatus Abawacabacteria_O | 0.003 |
| 0.002 | 0.0010 | 0.0021 | 0.0000002 | Candidatus Peregrinibacteria_O | 0.000 |
| -0.002 | -0.0029 | -0.0012 | 0.0000018 | Capnodiales_O | 0.000 |
| 0.002 | 0.0011 | 0.0020 | 0.0000000 | Candidatus Terrybacteria_O | 0.000 |
| -0.002 | -0.0033 | -0.0015 | 0.0000001 | Eurotiales_O | 0.000 |
| 0.002 | 0.0017 | 0.0033 | 0.0000000 | Gaiellales_O | 0.000 |
| 0.002 | 0.0009 | 0.0032 | 0.0003267 | Ignavibacteriales_O | 0.011 |
| -0.001 | -0.0012 | -0.0003 | 0.0005701 | Bacteriovoracales_O | 0.018 |
| 0.001 | 0.0008 | 0.0018 | 0.0000027 | S085_O | 0.000 |
# Применение функции ко всем таксонам
unique_taxa <- unique(P_long$Taxon)
result_list <- lapply(unique_taxa, function(taxon) {
analyze_result <- analyze_taxon(taxon, P_long)
if (!is.null(analyze_result)) {
return(analyze_result)
} else {
return(data.frame()) # Вернуть пустой data.frame, чтобы не влиять на bind_rows
}
})
# Объединение результатов в один dataframe
final_result_df <- bind_rows(result_list) %>%
mutate(estimate = round(estimate, 3),
`2.5 %` = round(`2.5 %`, 4),
`97.5 %` = round(`97.5 %`, 4)
)
# Коррекция на множественные сравнения по методу Холма
final_result_df$p.adjusted <- round(p.adjust(final_result_df$p.value, method = "holm"),3)
# Фильтрация только статистически значимых результатов
significant_results_P <- final_result_df %>%
filter(p.adjusted < 0.05) %>% arrange(desc(abs(estimate)))
# Вывод таблички
cat("Статистически значимые результаты:\n")
## Статистически значимые результаты:
kable(significant_results_P, "html") %>%
kable_styling(full_width = F)
| estimate | 2.5 % | 97.5 % | p.value | Taxon | p.adjusted |
|---|---|---|---|---|---|
| -15.536 | -22.0641 | -9.0082 | 4.00e-06 | Firmicutes_P | 0.000 |
| -0.080 | -0.1152 | -0.0445 | 1.17e-05 | Euryarchaeota_P | 0.000 |
| 0.038 | 0.0292 | 0.0458 | 0.00e+00 | Nitrospirota_P | 0.000 |
| 0.038 | 0.0302 | 0.0467 | 0.00e+00 | Armatimonadota_P | 0.000 |
| -0.015 | -0.0212 | -0.0090 | 1.50e-06 | Basidiomycota_P | 0.000 |
| 0.006 | 0.0037 | 0.0082 | 4.00e-07 | Hydrogenedentes_P | 0.000 |
| 0.006 | 0.0032 | 0.0097 | 9.66e-05 | Myxococcota_P | 0.001 |
create_and_plot_taxon_groups <- function(filter_pattern, step_size, dataset_name) {
# Получаем уникальные таксоны, удовлетворяющие условиям
unique_taxa <- Wilcox_comparison_round_2 %>%
subset(grepl(filter_pattern, Taxon)) %>%
filter(p_value_holm < 0.05) %>%
distinct(Taxon)
# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
# Проходим по группам и строим графики
for (i in seq_along(taxon_groups)) {
current_taxa <- taxon_groups[[i]]
current_dataset <- get(dataset_name) # Получаем датасет по его имени
G_long_filtered <- current_dataset %>%
filter(Taxon %in% current_taxa$Taxon & Percentage > 0) %>%
group_by(Health_state, Taxon) %>%
summarise(Count = n(), .groups = "drop")
bar_plot <- ggplot(G_long_filtered, aes(x = Health_state, y = Count, fill = Health_state == "Disease")) +
geom_bar(position = "dodge", color = "black", stat = "identity") +
scale_fill_manual(values = c("skyblue", "red"), guide = FALSE) +
labs(title = paste("Гистограмма для таксонов", (i - 1) * step_size + 1, "до", min(i * step_size, nrow(unique_taxa)), "в разрезе Health_state"),
x = "Статус пациента",
y = "Количество пациентов, у которых обнаружен данный таксон") +
coord_flip() +
facet_wrap(~Taxon, scales = "free_y", ncol = 2, shrink = 0.7) + # Уменьшение пространства между фасетами
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5), # Поворот текста на оси X и уменьшение шрифта
axis.text.y = element_text(hjust = 1, size = 5),
strip.text = element_text(size = 5)) # Уменьшение шрифта для названия фасет
print(bar_plot) # Отображение графика на экране
}
}
# Пример вызова функции с передачей "_G" в качестве аргумента и названия датасета "G_long"
create_and_plot_taxon_groups("_G", 18, "G_long")
create_and_plot_circular_diagrams <- function(dataset, filter_pattern, step_size, filename) {
# Получаем уникальные таксоны, удовлетворяющие условиям
unique_taxa <- Wilcox_comparison_round_2 %>%
subset(grepl(filter_pattern, Taxon)) %>%
filter(p_value_holm < 0.05) %>%
distinct(Taxon)
# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
# Проходим по группам и строим графики
for (i in seq_along(taxon_groups)) {
current_taxa <- taxon_groups[[i]]
G_long_filtered <- dataset %>%
filter(Taxon %in% current_taxa$Taxon & Percentage > 0) %>%
group_by(Health_state, Taxon) %>%
summarise(Count = n(), .groups = "drop")
# Добавляем строки с Count = 0 для всех комбинаций Health_state и Taxon
G_long_filtered <- G_long_filtered %>%
complete(Health_state, Taxon, fill = list(Count = 0))
data_id <- G_long_filtered %>% filter(G_long_filtered$Health_state == "Disease") %>%
mutate(id = row_number()) %>%
select(Taxon, id)
G_long_filtered <- left_join(G_long_filtered, data_id, by = "Taxon")
labels_data <- G_long_filtered %>% filter(Health_state == "Disease")
number_of_bars <- nrow(labels_data)
#Вычислим углы для лейбла каждого барра
labels_data$angel <- 90 - 360 * (labels_data$id-0.5) / number_of_bars
# Добавим горизонтольную регулировку
labels_data <- labels_data %>%
mutate(hjust = ifelse(angel < -90, 1, 0))
# Перевернем лейбл в зависимости от "полушария"
labels_data <- labels_data %>%
mutate(angel = ifelse(angel < -90, angel + 180, angel))
# Круговая диаграмма для объединенного датасета
p <- ggplot(data = G_long_filtered, mapping = aes(x = id, y = Count, fill = Health_state)) +
geom_bar(stat = "identity", position = "stack") +
ylim(-150, 500) +
# Добавляем кастомную тему
theme_minimal(base_size = 8) +
theme(axis.text = element_blank(),
axis.title = element_blank(),
panel.grid = element_blank(),
plot.margin = unit(rep(-1, 6), "cm")) +
coord_polar(start = 0)+
geom_text(data = labels_data,
aes(x = id, y = Count,
label = Taxon,
hjust = hjust),
color = "black",
fontface = "bold",
alpha = 0.9,
size = 1.5,
angle = labels_data$angel,
inherit.aes = FALSE) +
# Добавляем горизонтальные линии сетки
geom_hline(yintercept = seq(0, 300, by = 50),
linetype = "dotted",
color = "gray",
size = 0.5) +
# Добавляем значения рядом с линиями сетки
annotate("text", x = 1.2, y = seq(0, 300, by = 50),
label = seq(0, 300, by = 50),
color = "black",
size = 3,
alpha = 40,
hjust = 0)
p <- p + annotate("text", x = 0.5, y = 490, label = "Сотношение здоровых пациентов и пациентов с СРК",
size = 5, color = "black", fontface = "bold", hjust = 0.5)
# Сформируем имя файла для сохранения
current_filename <- paste0(filename, "_", i, ".png")
# Сохраняем график под уникальным именем
ggsave(current_filename, p, width = 10, height = 8)
print(p)
}
}
# Пример вызова функции
create_and_plot_circular_diagrams(G_long, "_G", 40, "output_plot")
create_and_plot_boxplots <- function(G_long, filter_pattern, step_size, x_var, title) {
# Получаем уникальные таксоны, удовлетворяющие условиям
unique_taxa <- Wilcox_comparison_round_2 %>%
subset(grepl(filter_pattern, Taxon)) %>%
filter(p_value_holm < 0.05) %>%
distinct(Taxon)
# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
# Проходим по группам и строим боксплоты
for (i in seq_along(taxon_groups)) {
current_taxa <- taxon_groups[[i]]
G_long_filtered <- G_long %>%
filter(Taxon %in% current_taxa$Taxon & Percentage > 0)
# Создаем формулу для переменной x_var
formula <- as.formula(paste("Age ~", x_var))
# Строим боксплот
box_plot <- ggplot(G_long_filtered, aes_string(x = x_var, y = "Age", fill = "Health_state")) +
geom_boxplot() +
labs(title = paste(title),
x = x_var,
y = "Age") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8), # Уменьшаем размер шрифта для подписей Taxon
axis.title.x = element_blank(), # Убираем название оси X
legend.position = "bottom") + # Перемещаем легенду вниз
scale_fill_manual(values = c("red", "skyblue")) +
facet_grid(~Taxon, scales = "free_y", space = "free") # Используем facet_grid вместо facet_wrap
# Отображаем график на экране
print(box_plot)
}
}
# Пример вызова функции с Health_state в качестве x_var
create_and_plot_boxplots(G_long, "_G", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")
create_and_plot_boxplots(G_long, "_G", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")
create_and_plot_boxplots(G_long, "_G", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")
create_and_plot_boxplots(G_long, "_G", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")
create_and_plot_taxon_groups("_F", 18, "F_long")
create_and_plot_circular_diagrams(F_long, "_F", 40, "F_output_plot")
create_and_plot_boxplots(F_long, "_F", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")
create_and_plot_boxplots(F_long, "_F", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")
create_and_plot_boxplots(F_long, "_F", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")
create_and_plot_boxplots(F_long, "_F", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")
create_and_plot_taxon_groups("_C", 18, "C_long")
create_and_plot_circular_diagrams(C_long, "_C", 40, "F_output_plot")
create_and_plot_boxplots(C_long, "_C", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")
create_and_plot_boxplots(C_long, "_C", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")
create_and_plot_boxplots(C_long, "_C", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")
create_and_plot_boxplots(C_long, "_C", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")
create_and_plot_taxon_groups("_O", 18, "O_long")
create_and_plot_circular_diagrams(O_long, "_O", 30, "F_output_plot")
create_and_plot_boxplots(O_long, "_O", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")
create_and_plot_boxplots(O_long, "_O", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")
create_and_plot_boxplots(O_long, "_O", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")
create_and_plot_boxplots(O_long, "_O", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")
create_and_plot_taxon_groups("_P", 18, "P_long")
create_and_plot_circular_diagrams(P_long, "_P", 30, "P_output_plot")
create_and_plot_boxplots(P_long, "_P", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")
create_and_plot_boxplots(P_long, "_P", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")
create_and_plot_boxplots(P_long, "_P", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")
create_and_plot_boxplots(P_long, "_P", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")
perform_permutation_test <- function(data, filter_pattern) {
combined_bacteria_G <- data %>%
select(Health_state, ends_with(filter_pattern))
d <- dist(combined_bacteria_G) # euclidean distances between the rows
fit <- cmdscale(d, eig=TRUE, k=2) # k is the number of dim
df_mds <- data.frame(
x = fit$points[,1],
y = fit$points[,2]
)
df_full <- cbind(df_mds, combined_info) %>%
mutate(Health_state_n = case_when(Health_state == "Health" ~ 0,
Health_state == "Disease" ~ 1))
msd_plot <- ggplot(df_full, aes(x = x, y = y, color = Health_state)) +
geom_point() +
theme_bw() +
ggtitle("Распределение вектора таксонов в зависимости от группы пациентов")
print(msd_plot)
print("Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state")
adonis2(d ~ Health_state_n, data = df_full)
}
# Пример вызова функции с использованием другого фильтра
perform_permutation_test(data_wide %>%
select(Health_state,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))), "_G")
## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = d ~ Health_state_n, data = df_full)
## Df SumOfSqs R2 F Pr(>F)
## Health_state_n 1 1046 0.0122 4.6817 0.006 **
## Residual 379 84659 0.9878
## Total 380 85705 1.0000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
perform_permutation_test(data_wide %>%
select(Health_state,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))), "_F")
## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = d ~ Health_state_n, data = df_full)
## Df SumOfSqs R2 F Pr(>F)
## Health_state_n 1 1919 0.01423 5.4706 0.002 **
## Residual 379 132940 0.98577
## Total 380 134859 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
perform_permutation_test(data_wide %>%
select(Health_state,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))), "_C")
## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = d ~ Health_state_n, data = df_full)
## Df SumOfSqs R2 F Pr(>F)
## Health_state_n 1 32982 0.12662 54.948 0.001 ***
## Residual 379 227490 0.87338
## Total 380 260472 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
perform_permutation_test(data_wide %>%
select(Health_state,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))), "_O")
## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = d ~ Health_state_n, data = df_full)
## Df SumOfSqs R2 F Pr(>F)
## Health_state_n 1 6923 0.04767 18.97 0.001 ***
## Residual 379 138309 0.95233
## Total 380 145232 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
perform_permutation_test(data_wide %>%
select(Health_state,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))), "_P")
## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = d ~ Health_state_n, data = df_full)
## Df SumOfSqs R2 F Pr(>F)
## Health_state_n 1 32728 0.12502 54.155 0.001 ***
## Residual 379 229042 0.87498
## Total 380 261770 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library("lme4")
library("stringr")
#Переменная имеет биномиальное распределение
plot(data_long$Серотонин)
#Процент пропущенных значений в переменной Серотонин 96,67%
na_serotonin <- sum(is.na(data_long$Серотонин)) / nrow(data_long) * 100
#Данные о серотонине есть только в F и G taxons
serotonin_taxons <- data_long %>%
filter(!is.na(`Серотонин`)) %>%
mutate(Serotonin_taxons = str_sub(Taxon, -2)) %>%
distinct(Serotonin_taxons) %>%
pull(Serotonin_taxons)
#При этом только 2 F таксона и 42 G таксона
serotonin_taxons_count_unique <- bac_functions %>%
filter(!is.na(Серотонин)) %>%
mutate(Serotonin_taxons = str_sub(Taxon, -2)) %>%
count(Serotonin_taxons)
#А всего 219 F в датасете
num_unique_F <- data_long %>%
filter(str_detect(Taxon, "_F$")) %>%
summarise(Num_Unique = n_distinct(Taxon)) %>%
pull(Num_Unique)
###Добавляем данные по родам из двух семейств в bac_functions (данные добавлены в 8-й лист Bacterial group functions.xlsx)
# Чтение листов Excel-файла с функциями бактерий и их объединение
path <- "data/raw/Bacterial group functions.xlsx"
taxon <- c ("TaxonName", "Rank")
neuromediators_1 <- read_xlsx (path, 8) %>%
mutate(Destroy = ifelse(is.na (Destroy), "produce", "destroy")) %>%
unique() %>%
pivot_wider(names_from = Neuromediator, values_from = Destroy)
probiotics <- read_xlsx (path, 3) %>%
add_column(probiotics = 1)
special_properties <- read_xlsx (path, 4) %>%
add_column(special_properties = 1)
vitamins <- read_xlsx (path, 5) %>%
pivot_wider (names_from = Vitamin, values_from = Vitamin,
values_fn = function(x) ifelse(is.na (x), NA, 1))
habbits <- read_xlsx (path, 7) %>%
unique() %>% #удаление повторяющихся строк
pivot_wider(names_from = Habbit, values_from = Habit_state)
bac_functions_1 <- read_xlsx (path, 1) %>% #Патогены и нежелательные
full_join(neuromediators_1, by = taxon) %>% #Нейромедиаторы
full_join(probiotics, by = taxon) %>% #Пробиотики
full_join(special_properties, by = taxon) %>% #С особыми свойствами
full_join(vitamins, by = taxon) %>% #Витамины
full_join(read_xlsx (path, 6), by = taxon) %>% #Продуценты КЦДК
full_join(habbits, by = taxon) %>% #Вредные привычки
unite("Taxon", TaxonName, Rank, sep = "_") %>%
filter (Taxon != "Blautia obeum_S") %>% #для данного таксона противоречивая информация в Продуценты КЦЖК
mutate_all(as.factor)
rm (path, taxon, neuromediators_1, probiotics, special_properties, vitamins, habbits)
data_long_1 <- data_wide %>%
pivot_longer(ends_with(c("_D", "_P", "_O", "_C", "_F", "_G")),
names_to = "Taxon", values_to = "Percentage")
#Перезапись data_long с добавлением функций бактерий
data_long_1 <- data_long_1 %>%
left_join (bac_functions_1, by = "Taxon")
G_long_1 <- data_long_1 %>% subset(grepl("_G", Taxon))
#Модель отдельно для таксонов G уровня
G_long_serotonin <- G_long_1 %>%
filter( ! is.na(`Серотонин`)) %>%
filter(Percentage > 0.0000)
G_long_serotonin$`Серотонин` <- relevel(G_long_serotonin$`Серотонин`, ref = "destroy")
G_long_serotonin$research_ID <- as.factor(G_long_serotonin$research_ID)
G_long_serotonin$Seq_date <- as.factor(G_long_serotonin$Seq_date)
G_long_serotonin$Seq_date <- as.factor(G_long_serotonin$Seq_date)
G_long_serotonin$patient_ID <- as.factor(G_long_serotonin$patient_ID)
model_ser_G_1 <- glmer(Health_state ~ Серотонин + (1 | research_ID), data = G_long_serotonin, family = binomial)
summary(model_ser_G_1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: Health_state ~ Серотонин + (1 | research_ID)
## Data: G_long_serotonin
##
## AIC BIC logLik deviance df.resid
## 8002.6 8027.1 -3998.3 7996.6 26740
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.58164 -0.00200 0.00227 0.00291 0.67902
##
## Random effects:
## Groups Name Variance Std.Dev.
## research_ID (Intercept) 482.6 21.97
## Number of obs: 26743, groups: research_ID, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.30153 2.04143 0.148 0.8826
## Серотонинproduce 0.14273 0.07277 1.961 0.0498 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Серотонинpr 0.000
Модель не показывает значимой ассоциации наличия серотонинпродуцирующих бактерий и состоянием здоровья человека (СРК/здоровый).
#Boruta for Random Forest, only G-level taxa
data_boruta <- data_wide_batched %>%
select(ends_with("_G") | "Health_state") # selection of taxa _G in dataset corrected for batch-effect (id reasearch)
library(Boruta)
Boruta(Health_state ~ ., data_boruta, ntree = 1000, maxRuns = 1000) %>% #Boruta
TentativeRoughFix() -> boruta_trained
boruta_trained %>%
attStats() %>%
rownames_to_column("Переменная") %>%
mutate(`Переменная` = `Переменная` %>% fct_reorder(`meanImp`)) %>%
filter(decision == "Confirmed") %>%
select("Переменная") -> boruta_trained_confirmed #dataset with significant variables
boruta_trained %>%
attStats() %>%
rownames_to_column("Переменная") %>%
mutate(`Переменная` = `Переменная` %>% fct_reorder(`meanImp`)) %>% filter(decision == "Confirmed") %>%
ggplot(aes(y = `Переменная`, x = meanImp, colour = decision, size = 4)) +
geom_point(aes(size = 8)) +
geom_errorbar(aes(xmin = minImp, xmax = maxImp, width = 3)) +
xlab("Среднее снижение энтропии") +
labs(color = "Значимость переменной") +
theme(legend.position = "bottom", axis.text = element_text(size =70))
data_wide_batched %>%
select(c(boruta_trained_confirmed$Переменная, Health_state)) -> data_RF
data_RF %>%
map(function(x) sum(is.na(x)) / length(x)) %>%
enframe() %>%
unnest(cols = value) %>%
arrange(desc(value))
## # A tibble: 179 × 2
## name value
## <chr> <dbl>
## 1 Hassallia_G 0
## 2 Cetobacterium_G 0
## 3 Erysipelotrichaceae UCG-009_G 0
## 4 Glutamicibacter_G 0
## 5 Halobacillus_G 0
## 6 Mucispirillum_G 0
## 7 Acetanaerobacterium_G 0
## 8 Dethiosulfatibacter_G 0
## 9 Syntrophomonas_G 0
## 10 FD2005_G 0
## # ℹ 169 more rows
data_RF%>%
select(where(is.factor)) %>%
map(table)
## $Health_state
##
## Disease Health
## 170 211
library(rsample)
data_RF <- data_RF[sample(nrow(data_RF)), ]
split_train_RandFor <- initial_split(data_RF, strata = Health_state, prop = 0.9)
data_RandFor_train <- split_train_RandFor %>% training()
data_RandFor_test <- split_train_RandFor %>% testing()
data_RandFor_test_alternative <- data_wide %>% #data for test of model, only research 4, becase both value of variable Health_state
filter(research_ID == 4)
cat_metric <- yardstick::metric_set(
yardstick::bal_accuracy,
yardstick::precision,
yardstick::recall,
yardstick::f_meas,
yardstick::sensitivity,
yardstick::specificity,
yardstick::j_index
)
library(themis)
library(embed)
data_recipe_RandFor <- recipe(Health_state ~ ., data_RandFor_train) %>%
step_impute_bag(all_numeric_predictors()) %>%
step_zv(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_lincomb(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors())
library(parsnip)
rf_model <- rand_forest(mode = "classification", mtry = tune(), trees = tune(), min_n = tune()) %>%
set_engine("ranger")
cv_samples <- vfold_cv(data_RandFor_train, strata = Health_state, v = 10)
library(dials)
parameters_grid <- grid_max_entropy(mtry(range = c(3, 10)), trees(), min_n(), size = 20)
library(workflows)
reg_workflow <- workflow() %>%
add_recipe(data_recipe_RandFor) %>%
add_model(rf_model)
library(tidymodels)
grid_search <- reg_workflow %>%
tune_grid(
object = reg_workflow,
resamples = cv_samples,
grid = parameters_grid,
control = control_grid(save_pred = TRUE),
metrics = metric_set(sensitivity, specificity, j_index)
)
grid_search %>%
collect_metrics() %>%
ggplot(aes(trees, mean, color = .metric)) +
geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), alpha = 0.8) +
geom_line(size = 1)
best_by_j_index <- grid_search %>% select_best("j_index")
final_reg_model <- finalize_workflow(
reg_workflow,
best_by_j_index
)
# test data
final_reg_model %>%
fit(data_RandFor_train) %>%
predict(data_RandFor_test_alternative) %>%
pull() -> final_RandFor_test_prediction
metrics_on_test <- cat_metric(truth = truth_values, estimate = estimate_values, tibble(truth_values = data_RandFor_test_alternative$Health_state, estimate_values = final_RandFor_test_prediction)) %>% rename(test_estimate = `.estimate`) %>% select(!`.estimator`)
# train data
final_reg_model %>%
fit(data_RandFor_train) %>%
predict(data_RandFor_train) %>%
pull() -> final_RandFor_train_prediction
metrics_on_train <- cat_metric(truth = truth_values, estimate = estimate_values, tibble(truth_values = data_RandFor_train$Health_state, estimate_values = final_RandFor_train_prediction)) %>% rename(train_estimate = `.estimate`) %>% select(!`.estimator`)
# binding
metrics_on_test %>%
left_join(metrics_on_train, by = ".metric") %>%
mutate(differencies = train_estimate - test_estimate)
## # A tibble: 7 × 4
## .metric test_estimate train_estimate differencies
## <chr> <dbl> <dbl> <dbl>
## 1 bal_accuracy 1 1 0
## 2 precision 1 1 0
## 3 recall 1 1 0
## 4 f_meas 1 1 0
## 5 sensitivity 1 1 0
## 6 specificity 1 1 0
## 7 j_index 1 1 0
last_fit(
final_reg_model,
split_train_RandFor
) -> final_RF_model
final_RF_model %>% write_rds("RandomForest.rds") #saving model
final_RF_model %>%
extract_workflow() %>%
predict(data_RandFor_test_alternative, type = "class") %>%
pull() -> class_prediction
final_RF_model %>%
extract_workflow() %>%
predict(data_RandFor_test_alternative, type = "prob") %>%
pull() -> prob_prediction
RF_model_results <- tibble(truth = data_RandFor_test_alternative$Health_state,
estimate = class_prediction,
prob_yes = 1 - prob_prediction)
cat_metric(truth = truth, estimate = estimate, RF_model_results)
## # A tibble: 7 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 bal_accuracy binary 1
## 2 precision binary 1
## 3 recall binary 1
## 4 f_meas binary 1
## 5 sensitivity binary 1
## 6 specificity binary 1
## 7 j_index binary 1
RF_model_results %>%
roc_curve(truth = truth, prob_yes) %>%
autoplot()